Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
114 commits
Select commit Hold shift + click to select a range
ea2f9e5
Remove some old stuff
natefoo Nov 3, 2022
281e449
Add ask.galaxyproject.org
natefoo Nov 4, 2022
438eb22
ask: rewrite / to /docs
natefoo Nov 4, 2022
98b3ace
TACC local accounts for if there are LDAP troubles
natefoo Nov 4, 2022
44de788
Switch TACC/JS2 to use standard Slurm ports
natefoo Nov 8, 2022
968076a
Update RealMemory for fluctuating JS2 instance memory
natefoo Nov 8, 2022
a75b4bc
Add some large nodes to JS2 TPV partition
natefoo Nov 10, 2022
e83ba63
Remove gxy.io zones and nginx configs
natefoo Nov 10, 2022
3809f29
Add xl instances to tpv partition on JS2
natefoo Nov 10, 2022
b06e6cc
Update access list
natefoo Nov 17, 2022
28456de
Update key
natefoo Nov 17, 2022
acc2eb6
Increase default telegraf interval from 10s to 60s
natefoo Dec 20, 2022
cacfa01
Monitor CVMFS stratum 0 and 1 servers, and JS2 squid proxy with teleg…
natefoo Dec 20, 2022
73952da
Use a non-repo-specific check for CVMFS servers
natefoo Dec 20, 2022
64a2ea9
Decrease memory used by squid on js2 controller to 4 GB
natefoo Dec 20, 2022
689acab
Check CVMFS repos
natefoo Dec 20, 2022
9883d1f
Prune CVMFS snapshots
natefoo Dec 22, 2022
6170aeb
GTN:GA4GH TRS endpoint support rewrite
hexylena Feb 9, 2023
a89bf11
NFS mount changes I forgot to commit
natefoo Feb 20, 2023
4539d2b
Team access list
natefoo Feb 20, 2023
c652824
Merge pull request #39 from hexylena/ga4gh-rewrite
natefoo Mar 1, 2023
194ce7b
Fix backref
natefoo Mar 1, 2023
1c02ceb
Update dj-wasabi.telegraf
natefoo Mar 13, 2023
af2e066
Automatic updates on galaxy hosts
natefoo Mar 13, 2023
de38230
Ensure yum-cron updates are actually applied
natefoo Mar 27, 2023
f95eaae
Don't use non-functional "security" command for yum-cron on CentOS 7.
natefoo Apr 4, 2023
85ff0f3
Add tool popularity query to telegraf
natefoo Apr 12, 2023
06513de
Update access
natefoo Apr 12, 2023
342cca8
add my.galaxy.training links
hexylena Apr 18, 2023
1138c1b
Merge pull request #40 from hexylena/my
natefoo Apr 18, 2023
9627b4e
Deploy my.galaxy.training
natefoo Apr 18, 2023
a15a4d5
fix my stupid mistake
hexylena Apr 19, 2023
25bea08
Merge pull request #41 from hexylena/my
natefoo Apr 19, 2023
4aa6894
Switch test CVMFS repo to common master key
natefoo May 12, 2023
296bcce
Merge branch 'main' of github.com:galaxyproject/infrastructure-playbo…
natefoo May 12, 2023
e979a24
Install python39-devel on JS2 controller for pycurl
natefoo May 15, 2023
26e023b
Drop ELRepo/kernel-ml setup on JS2 now that we're just using fuse-ceph
natefoo May 15, 2023
0c4d143
Additional JS2 node image fixes
natefoo May 15, 2023
9b46fa0
Add additional GPU nodes and make them exclusive
natefoo May 15, 2023
37d05b8
Update gxadmin
natefoo May 22, 2023
a6f4611
Automatically clean Sentry events
natefoo Jun 6, 2023
c5a9159
Disable generation of gxit image for now
natefoo Jun 9, 2023
186f194
Add IDC user to JS2 image
natefoo Jun 9, 2023
ed53224
Fix JS2 tiny memory again
natefoo Jun 9, 2023
3558f7b
Create a redirect loop to usegalaxy.eu
kysrpex Jun 13, 2023
fc12eef
Fix dates on comment about redirection loop
kysrpex Jun 13, 2023
9c10216
Merge pull request #43 from kysrpex/usegalaxu-eu-redirect-fix
natefoo Jun 13, 2023
cb17a06
experimental webfinger support
hexylena Jun 14, 2023
af2d262
Merge pull request #44 from hexylena/fedi
natefoo Jun 28, 2023
17a1e20
Drop references to geerlingguy.repo-epel
natefoo Jun 28, 2023
27fbb33
Add IDC CVMFS repo
natefoo Jul 6, 2023
8ebbf31
Add IDC config for TACC stratum1
natefoo Jul 6, 2023
c467d55
Serve IDC data on datacache
natefoo Jul 8, 2023
b0a6c9b
Track idc repo in influx
natefoo Jul 9, 2023
c84703b
Add cache dir for Test
natefoo Aug 15, 2023
c1ee8cc
Update JS2 access
natefoo Aug 16, 2023
13c2c26
EL8 fixes
natefoo Sep 6, 2023
59ac159
Add galaxy-vgp TACC VM
natefoo Sep 6, 2023
45b9e92
Correct mount
natefoo Sep 6, 2023
14a05d0
Add vgp partition on JS2
natefoo Sep 13, 2023
c801ec2
Increase stats memory to 12G
natefoo Oct 2, 2023
4e378ab
Update tmpwatch-auto to exclude running jobs and include sockets
natefoo Oct 2, 2023
e23b2ec
Create a large enough dir on cvmfs0-psu0 for IDC temp operations
natefoo Oct 3, 2023
bffca1a
Use individual ssh keys for tacc root
natefoo Oct 3, 2023
a0f8f34
Stop collecting queue overview in influx
natefoo Oct 19, 2023
e123af7
Add JS2 large mem nodes
natefoo Oct 23, 2023
baa4ddb
Add some more smaller instances to tpv partition on JS2
natefoo Oct 25, 2023
0ed9736
Drop JS2 xl partition and move remaining xl instances to tpv partition
natefoo Oct 25, 2023
30935b2
Assign JS2 node weights to prefer smaller instances
natefoo Oct 26, 2023
7751765
Add more JS2 large and xl instances for TPV
natefoo Oct 30, 2023
602ec83
Decrease roundup oversubscribe
natefoo Nov 1, 2023
fd0402c
Well that force killed all running jobs because Low socket*core*threa…
natefoo Nov 1, 2023
e71d400
Use MaxCPUsPerNode instead
natefoo Nov 2, 2023
1d4da3c
Set `CVMFS_QUOTA_LIMIT` on JS2 instances to 60% of free space on boot
natefoo Nov 2, 2023
383ebae
Add a mutex to tmpwatch-auto to prevent parallel execution
natefoo Nov 8, 2023
ec435af
Track Js2 tmpwatch-auto mutex age in InfluxDB for alerting
natefoo Nov 8, 2023
1d325ae
Make mutex path readable
natefoo Nov 8, 2023
5fc547b
Upgrade JS2 controller to EL9
natefoo Nov 10, 2023
0b1cd99
JS2 image build fixes
natefoo Nov 10, 2023
9cb8267
More JS2 SlurmScale and image build fixes
natefoo Nov 10, 2023
899d9b5
Cgroups fixes for JS2 EL9
natefoo Nov 10, 2023
a9e30f5
Move amqp consume tmpwatch jobs for JS2 from usegalaxy-playbook
natefoo Nov 10, 2023
b7a6c93
Dump JS2 Pulsar open FD count into influx
natefoo Nov 14, 2023
2a4acdb
Rebalance JS2 instance types and drop old partitions
natefoo Nov 14, 2023
a3bc72d
Add a JS2 partition for NCBI FCS GX
natefoo Nov 14, 2023
8b4eb92
Slurm prolog updates (and drop epilog) for resize-shm
natefoo Nov 15, 2023
8311da5
Updated internal DNS name for cvmfs1-iu0 (for the record, it's
natefoo Nov 15, 2023
adde9ab
Move JS2 prolog to /etc/slurm and copy it in the slurmscale resume pl…
natefoo Nov 17, 2023
4700250
Rebalance JS2 instances
natefoo Nov 20, 2023
0a717d8
Deploy the user/mount prolog from TACC on JS2 for scratch volume issues
natefoo Nov 22, 2023
cdd7c92
Stop JS2 instances before terminating them, for Ceph
natefoo Nov 25, 2023
4b0042c
Run both prolog scripts on JS2
natefoo Nov 29, 2023
b9c3bd5
Use fuse.ceph on JS2 controller
natefoo Dec 7, 2023
9da1722
Switch all nodes to fuse.ceph
natefoo Dec 7, 2023
14fa065
Disable tmpwatch-auto
natefoo Dec 7, 2023
a8ffa0c
Rotate httpd/squid logs for CVMFS more aggressively to prevent disk from
natefoo Dec 18, 2023
75933d9
Update slurm scale tailscale key
natefoo Dec 23, 2023
1c9754a
Rename reserved partition to priority and add one for JS2
natefoo Jan 10, 2024
5c98350
Increase JS2 instances for PAG workshop
natefoo Jan 12, 2024
780fc24
Update slurp for partition name change
natefoo Jan 12, 2024
6702e0b
Reenable nohold_on_prolog_fail on JS2
natefoo Jan 12, 2024
971e548
Don't attempt to connect JS2 slurm to accounting
natefoo Feb 9, 2024
18e642c
Properly deploy JS2 scratch cleanup and Pulsar FDs scripts
natefoo Feb 20, 2024
9e52fcf
Relaunch JS2 controller as volume-backed instance
natefoo Feb 20, 2024
4e01ab9
Deploy a separate Pulsar server instance for VGP on JS2
natefoo Mar 13, 2024
5f199b0
Update io -> diskio module for telegraf
natefoo Mar 14, 2024
0f91271
Ignore telegraf_cvmfs
natefoo Mar 15, 2024
ddde95f
Updates to splunk configs and certs
natefoo Mar 15, 2024
483eb73
Add role for MS Defender (mdatp)
natefoo Mar 18, 2024
4808b53
Shift to more usage of JS2 Large Memory instances
natefoo Mar 19, 2024
3fbffdd
updaterepo.sh: support storing Singularity images unpacked
natefoo Mar 19, 2024
383ed5e
Remove debugging commenting of open transaction
natefoo Mar 19, 2024
f5c0b84
Minor updaterepo corrections
natefoo Mar 20, 2024
1584989
Use unpacked Singularity images
natefoo Mar 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ roles/galaxyproject.opendkim
roles/galaxyproject.postfix
roles/galaxyproject.postgresql
roles/galaxyproject.slurm
roles/galaxyproject.telegraf_cvmfs
roles/geerlingguy.repo-epel
roles/usegalaxy_eu.certbot
roles/galaxyproject.gxadmin
Expand Down
178 changes: 178 additions & 0 deletions env/common/files/admin/pulsar-clean-jobs
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
#!/bin/bash
##
## This file is maintained by Ansible - CHANGES WILL BE OVERWRITTEN
##
# Removes non-active job directories from a Pulsar jobs directory. The age
# cutoff shrinks as the filesystem holding the directory fills up.
set -euo pipefail

# Percent-full of the filesystem holding DIR; assigned by clean()
PERCENT=
# Active job dirs older than this many hours are reported with a warning
WARN_HOURS=72
# Set to true by -n: log what would be removed without removing anything
DRY_RUN=false
# Lock directory; created with mkdir in main() and removed by trap_handler()
MUTEX="$HOME/pulsar-clean-jobs.lock"
# True once this process has created MUTEX
MUTEX_ACQUIRED=false
# Cache of active job ids, filled on the first is_active() call
declare -a ACTIVE_IDS
# True once ACTIVE_IDS has been populated
ACTIVE_IDS_SET=false

# Print the one-line synopsis to stdout. When called as `usage help`, follow
# it with the description of each option and the positional argument.
function usage() {
    printf 'usage: %s [-hn] DIR\n' "$0"
    [ "${1:-}" = 'help' ] || return 0
    printf '%s\n' \
        '-h print help' \
        '-n dry run' \
        'DIR path of pulsar jobs dir to clean'
}

# Parse command-line flags:
#   -h  print the full help and exit 0
#   -n  dry run (sets DRY_RUN=true)
# Any other flag is rejected with exit status 2. This matters because a
# mistyped dry-run flag must never fall through to a real delete.
while getopts ":hn" opt; do
    case "$opt" in
        h)
            usage help
            exit 0
            ;;
        n)
            DRY_RUN=true
            ;;
        \?)
            # The leading ':' in the optstring selects silent mode, in which
            # getopts stores the offending option character in OPTARG.
            echo "ERROR: invalid option: -${OPTARG}" >&2
            usage >&2
            exit 2
            ;;
    esac
done
# Drop the parsed flags so that $1 is the positional DIR argument
shift $((OPTIND-1))

# The only positional argument is the Pulsar jobs directory to clean.
DIR="${1:-}"
[ -n "$DIR" ] || { usage; exit 2; }

# FIXME: JS2-specific hack
# The Pulsar persisted-data directory is chosen from the DIR path. Its
# contents are used later to decide which job directories are still active.
case "$DIR" in
    */main/*)
        PULSAR_PERSISTED_DATA='/srv/pulsar/main/var/persisted_data'
        ;;
    */test/*)
        PULSAR_PERSISTED_DATA='/srv/pulsar/test/var/persisted_data'
        ;;
    *)
        # Without a persisted-data dir there is no way to tell active jobs
        # from dead ones, so refuse to run rather than treat every job
        # directory as removable.
        echo "ERROR: cannot determine PULSAR_PERSISTED_DATA for: $DIR (expected a path containing /main/ or /test/)" >&2
        exit 2
        ;;
esac


# Release the lock directory if, and only if, this process still holds it.
# Safe to call more than once: the flag is cleared before the rmdir, so a
# later invocation (e.g. EXIT after ERR or after a signal) cannot remove a
# lock that another process has created in the meantime.
# Always returns 0 so that it never changes the script's exit status.
function trap_handler() {
    if $MUTEX_ACQUIRED; then
        MUTEX_ACQUIRED=false
        rmdir "$MUTEX" || echo "WARNING: unable to remove lock: $MUTEX" >&2
    fi
    return 0
}
trap 'trap_handler' ERR EXIT
# A trapped signal does not end the script by itself: without an explicit
# exit the run would continue after the lock had already been released.
trap 'trap_handler; exit 130' SIGINT
trap 'trap_handler; exit 143' SIGTERM


# Classify the value of the variable whose *name* is passed in $1.
# Arguments:
#   $1 - name of the variable to inspect (not its value)
#   $2 - optional label used in the error message; defaults to "$<name>"
# Returns:
#   0 if the value is an integer
#   1 if the value is empty (or the variable is unset)
#   2 if the value is not an integer; an error is written to stderr
function isintorempty() {
    local name=${2:-"\$$1"}
    [ -n "${!1:-}" ] || return 1
    # `[ x -eq x ]` fails with status 2 when x is not an integer
    if ! [ "${!1}" -eq "${!1}" ] 2>/dev/null; then
        # stderr, not stdout: callers run inside command substitutions and
        # would otherwise capture the message as if it were a result
        echo "ERROR: invalid non-integer value for $name: ${!1}" >&2
        return 2
    fi
    return 0
}


# Succeed only if the variable whose *name* is passed in $1 holds an integer.
# Arguments:
#   $1 - name of the variable to inspect (not its value)
#   $2 - optional label used in error messages; defaults to "$<name>"
# Returns:
#   0 if the value is an integer, 1 otherwise (empty or non-integer).
#   The empty-value error is written to stderr here; isintorempty reports
#   the non-integer case itself.
function isint() {
    local name=${2:-"\$$1"}
    # quoted so that a label containing spaces is passed through intact
    if isintorempty "$1" "$name"; then
        return 0
    else
        # $? is still the status of isintorempty here: 1 means "empty"
        [ $? -ne 1 ] || echo "ERROR: $name value cannot be empty string" >&2
        return 1
    fi
}


# Write an informational message to stdout.
# The message is coloured green only when stdout itself is a terminal, so
# redirected or piped output never contains escape sequences.
function log() {
    if [ -t 1 ]; then
        echo -e '\033[1;32mINFO: ' "$@" '\033[0m'
    else
        echo 'INFO:' "$@"
    fi
}


# Write a warning message to stdout.
# The message is coloured yellow only when stdout itself is a terminal, so
# redirected or piped output never contains escape sequences.
function log_warning() {
    if [ -t 1 ]; then
        echo -e '\033[1;33mWARNING:' "$@" '\033[0m'
    else
        echo 'WARNING:' "$@"
    fi
}


# Write an error message to stderr, in both the coloured and plain forms.
# The message is coloured red only when stderr itself is a terminal.
function log_error() {
    if [ -t 2 ]; then
        echo -e '\033[1;31mERROR:' "$@" '\033[0m' >&2
    else
        echo 'ERROR:' "$@" >&2
    fi
}


# Report the given message through log_error, then terminate the script
# with exit status 1.
function error() {
    log_error "$@"; exit 1
}


# Print the usage percentage of the filesystem that holds $DIR as a bare
# number: the last line of df's pcent column with '%' and spaces removed.
function percent_full() {
    df --output=pcent "$DIR" \
        | tail -n 1 \
        | tr -d '% '
}


# Print the retention period, in hours, that corresponds to the current
# value of $PERCENT: the fuller the filesystem, the shorter the period.
# Exits with status 1 if $PERCENT is not an integer.
function hours() {
    isint PERCENT || exit 1
    # "threshold:hours" pairs, highest threshold first; the first threshold
    # that $PERCENT reaches decides the result
    local step
    for step in 98:1 95:6 90:12 80:24 70:36 60:48 40:72; do
        if [[ $PERCENT -ge ${step%%:*} ]]; then
            echo "${step#*:}"
            return 0
        fi
    done
    # below every threshold
    echo 96
}


# Succeed if the given job id is listed as active or preprocessing.
# Arguments:
#   $1 - job id (the name of a job directory)
# Globals:
#   PULSAR_PERSISTED_DATA (read), ACTIVE_IDS and ACTIVE_IDS_SET (written)
# Returns:
#   0 if the id is active, 1 otherwise. Aborts through error() when the
#   persisted-data directory is unavailable.
# The id list is read once, on the first call, from the file names found in
# the *jetstream2-active-jobs and *jetstream2-preprocessing-jobs directories.
function is_active() {
    local id="$1"
    local active_id
    if ! $ACTIVE_IDS_SET; then
        # An unreadable list would make every job look inactive and so
        # eligible for removal; stop instead of guessing.
        if [[ -z "${PULSAR_PERSISTED_DATA:-}" || ! -d "$PULSAR_PERSISTED_DATA" ]]; then
            error "PULSAR_PERSISTED_DATA is unset or not a directory; cannot determine active jobs"
        fi
        # one array element per line, so names are never word-split or globbed
        mapfile -t ACTIVE_IDS < <(find "$PULSAR_PERSISTED_DATA"/*jetstream2-{active,preprocessing}-jobs -maxdepth 1 -type f -printf '%f\n')
        ACTIVE_IDS_SET=true
    fi
    # the ${arr[@]+...} form keeps an empty array from tripping `set -u` on
    # older bash releases
    for active_id in ${ACTIVE_IDS[@]+"${ACTIVE_IDS[@]}"}; do
        # quoted right-hand side: literal comparison, not a glob match
        if [[ "$active_id" == "$id" ]]; then
            return 0
        fi
    done
    return 1
}


# Remove every non-hidden entry directly under $DIR whose ctime is older
# than the retention period chosen by hours(), unless is_active reports its
# name as an active job. Active entries older than WARN_HOURS are reported
# with a warning. With DRY_RUN=true removals are logged but not performed.
# Globals:
#   DIR, DRY_RUN, WARN_HOURS (read); PERCENT (written)
function clean() {
    local entry ctime id dir hours min_ctime now
    PERCENT=$(percent_full)
    # Declared separately from the assignments: `local x=$(cmd)` would hide
    # a failure of cmd and let the run continue with an empty cutoff.
    hours=$(hours) || exit 1
    min_ctime=$(date --date="$hours hours ago" '+%s') || exit 1
    now=$(date '+%s')
    log "at $(date), $DIR is ${PERCENT}% full; scratch dirs older than $(date --date="$hours hours ago") ($hours hours) will be removed"
    # Records are "<ctime> <name>" terminated by NUL, oldest first, so that
    # names containing spaces or newlines are handled intact.
    while IFS= read -r -d '' entry; do
        ctime=${entry%% *}
        ctime=${ctime%%.*}      # %C@ carries a fractional part
        id=${entry#* }
        [[ "$ctime" =~ ^[0-9]+$ && -n "$id" ]] || continue
        [[ $ctime -lt $min_ctime ]] || continue
        if is_active "$id"; then
            log "ignoring active: $id"
            if [[ $((now - ctime)) -gt $((WARN_HOURS * 60 * 60)) ]]; then
                log_warning "supposedly active job dir older than $WARN_HOURS hours: $id"
            fi
        else
            dir="${DIR}/${id}"
            log "removing: $dir"
            $DRY_RUN || rm -rf -- "$dir"
        fi
    # hidden entries are skipped, as a plain directory listing would
    done < <(find "$DIR" -mindepth 1 -maxdepth 1 ! -name '.*' -printf '%C@ %f\0' | sort -z -n)
}


# Validate $DIR, then run clean() while holding the lock directory $MUTEX.
# If the lock cannot be created (for example, another run holds it) nothing
# is cleaned and the function returns 0.
function main() {
    if [ ! -d "$DIR" ]; then
        error "Invalid directory: $DIR"
    fi
    # mkdir either creates the lock or fails, so it doubles as the mutex
    mkdir "$MUTEX" || return 0
    MUTEX_ACQUIRED=true
    clean
}


# Script entry point; all work happens in main().
main
39 changes: 37 additions & 2 deletions env/common/files/admin/tmpwatch-auto
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,31 @@ set -euo pipefail

# Directory to clean. Defaults to empty so that, under `set -u`, a missing
# argument reaches the usage message in main() instead of aborting here.
DIR="${1:-}"
PERCENT=
# FIXME: shouldn't be in /tmp, but needs to be readable by telegraf
MUTEX="/tmp/tmpwatch-auto-$(id -un).lock"
# True once this process has created MUTEX
MUTEX_ACQUIRED=false
# Extra tmpwatch arguments, filled by exclude_args()
declare -a EXCLUDE_ARGS=()

# The Pulsar persisted-data directory is chosen from the DIR path
case "$DIR" in
    */main/*)
        PULSAR_PERSISTED_DATA='/srv/pulsar/main/var/persisted_data'
        ;;
    */test/*)
        PULSAR_PERSISTED_DATA='/srv/pulsar/test/var/persisted_data'
        ;;
esac


# DEBUG=true in the environment enables verbose output and command tracing
: "${DEBUG:=false}"


# Release the lock directory if, and only if, this process still holds it.
# Safe to call more than once: the flag is cleared before the rmdir, so a
# later invocation (e.g. EXIT after ERR or after a signal) cannot remove a
# lock that another process has created in the meantime.
# Always returns 0 so that it never changes the script's exit status.
function trap_handler() {
    if $MUTEX_ACQUIRED; then
        MUTEX_ACQUIRED=false
        rmdir "$MUTEX" || echo "WARNING: unable to remove lock: $MUTEX" >&2
    fi
    return 0
}
trap 'trap_handler' ERR EXIT
# A trapped signal does not end the script by itself: without an explicit
# exit the run would continue after the lock had already been released.
trap 'trap_handler; exit 130' SIGINT
trap 'trap_handler; exit 143' SIGTERM


function isintorempty() {
# if the value referenced by the var *name* passed in $1 is:
# an integer: return 0
Expand Down Expand Up @@ -65,20 +86,34 @@ function hours() {
}


# Append a `-x <DIR>/<id>` pair to EXCLUDE_ARGS for every file found in the
# jetstream2-active-jobs and jetstream2-preprocessing-jobs directories under
# $PULSAR_PERSISTED_DATA.
# Globals:
#   PULSAR_PERSISTED_DATA, DIR (read); EXCLUDE_ARGS (appended to)
# Aborts the script if PULSAR_PERSISTED_DATA is unset or empty, because an
# empty exclusion list would leave every job directory open to cleaning.
function exclude_args() {
    local exclude_id
    local -a exclude_ids=()
    : "${PULSAR_PERSISTED_DATA:?is not set; cannot determine which job dirs to exclude}"
    # one array element per line, so names are never word-split or globbed
    mapfile -t exclude_ids < <(find "$PULSAR_PERSISTED_DATA"/jetstream2-{active,preprocessing}-jobs -maxdepth 1 -type f -printf '%f\n')
    # this could exceed ARG_MAX but these days it's 2 million so probably not
    # the ${arr[@]+...} form keeps an empty array from tripping `set -u` on
    # older bash releases
    for exclude_id in ${exclude_ids[@]+"${exclude_ids[@]}"}; do
        EXCLUDE_ARGS+=(-x "${DIR}/${exclude_id}")
    done
}


# Run tmpwatch once over $DIR with the retention period chosen by hours(),
# excluding the paths collected by exclude_args(). With DEBUG=true tmpwatch
# runs verbosely and the command line is traced.
# Globals:
#   DIR, DEBUG, EXCLUDE_ARGS (read)
function clean() {
    # declared separately so that a failure of hours() is not masked
    local hours
    local verbose=
    hours=$(hours)
    # exclusions must be in place before tmpwatch is invoked
    exclude_args
    $DEBUG && verbose="--verbose"
    $DEBUG && set -x
    # $verbose is deliberately unquoted: when empty it must vanish entirely
    /usr/bin/tmpwatch --mtime --dirmtime --all --exclude-user=root "${EXCLUDE_ARGS[@]}" $verbose "${hours}h" "$DIR"
    # turn tracing back off without tracing the `set +x` itself
    { set +x; } 2>/dev/null
}


# Validate $DIR, then run clean() exactly once while holding the lock
# directory $MUTEX. If the lock cannot be created (for example, another run
# holds it) nothing is cleaned and the function returns 0.
function main() {
    [ -n "$DIR" ] || { echo "usage: $0 <dir>"; exit 2; }
    [ -d "$DIR" ] || error "Invalid directory: $DIR"
    # mkdir either creates the lock or fails, so it doubles as the mutex;
    # clean must never run outside of it
    if mkdir "$MUTEX"; then
        MUTEX_ACQUIRED=true
        clean
    fi
}


Expand Down
5 changes: 4 additions & 1 deletion env/common/group_vars/baseenv/vars.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ influxdb_client_pass: "{{ vault_influxdb_client_pass }}"
# run telegraf on everything
telegraf_agent_package_state: latest

# 10s is the default if unset, set `interval = "<seconds>s"` on [agent] or on plugins to override
telegraf_agent_interval: 60

telegraf_agent_output:
- type: influxdb
config:
Expand All @@ -22,7 +25,7 @@ telegraf_plugins_default:
- plugin: disk
- plugin: kernel
- plugin: processes
- plugin: io
- plugin: diskio
- plugin: mem
- plugin: system
- plugin: swap
Expand Down
Loading