#!/bin/bash
#
# Boot-time and run-time diagnostics
# Checks for the presence/health of various components
# Undrains the node if it is healthy
#
usage() {
  echo "Usage: $0 [-a] [-b] [-c] [-h] [-l] [-r] [-u] [-v]"
  echo " -a all mode - run all tests"
  echo " -b boot mode - mark the boot sequence as complete"
  echo " -c check node only - do not change Slurm state"
  echo " -h print this help message"
  echo " -l local checks only - avoid network-dependent checks"
  echo " -r manually run node screen after failure"
  echo " -u force undrain - clear hand-set drain messages"
  echo " -v verbose mode"
  echo ""
}
if [[ -e /run/checknode/lock ]]; then
  echo "Checknode lock already in place"
  logger -t checknode "Checknode lock already in place"
  exit 1
fi
# Install the cleanup trap only after the lock check, so bailing out above
# does not remove a lock held by another running instance
trap "rm -f /run/checknode/lock" EXIT
ALL=0
BOOT=0
CHECKONLY=0
LOCALONLY=0
ERRORCNT=0
ERRORSTR=''
UNDRAIN=0
VERBOSE=0
MANUALNODESCREEN=0
export SLURM_CONF=/etc/slurm/slurm.conf.min
mkdir -p /run/checknode
mkdir -p /run/checknode/journalcache
touch /run/checknode/lock
echo "running" > /run/checknode/state
echo -n > /run/checknode/log
[ -L /root/checknode_state ] || ln -sf /run/checknode/state /root/checknode_state
[ -L /root/checknode_log ] || ln -sf /run/checknode/log /root/checknode_log
###############################################################################
# Process arguments
###############################################################################
while getopts "abchlruv" option; do
  case $option in
    a ) ALL=1
        LOCALONLY=0
        ;;
    b ) BOOT=1
        ;;
    c ) CHECKONLY=1
        ;;
    l ) LOCALONLY=1
        SLURM_OK=0
        ;;
    h ) usage
        exit 1
        ;;
    r ) MANUALNODESCREEN=1
        ;;
    u ) UNDRAIN=1
        ;;
    v ) VERBOSE=1
        ;;
    \? ) usage
        exit 1
        ;;
    * ) echo "Option $option not understood"
        usage
        exit 1
        ;;
  esac
done
[ $BOOT -eq 1 ] && touch /run/checknode/booted
if [[ ! -e /run/checknode/booted ]]; then
  echo "Node is still booting"
  exit 1
fi
if [ -e /home/quarantine/skip_network ]; then
  QUARANTINED=$(cat /home/quarantine/skip_network)
  MYHOST=$(hostname)
  SKIP=$(/opt/clmgr/bin/cluset -f $QUARANTINED -i $MYHOST)
  if [[ -n "$SKIP" ]]; then
    [ $LOCALONLY -eq 0 ] && echo "Skipping checknode, use -l flag" && exit 0
    LOCALONLY=1
  fi
fi
[ $LOCALONLY -eq 0 ] && ping -c1 -W5 -q slurm1 > /dev/null 2>&1 && SLURM_OK=1 || SLURM_OK=0
# Check for running jobs
scontrol listpids >/dev/null 2>&1 && RUNNINGJOB=1 || RUNNINGJOB=0
# If the node has been booted for less than 15 minutes, do a full check
[ $(awk -F. '{print $1}' /proc/uptime) -lt 900 ] && ALL=1
HOST=$(hostname)
###############################################################################
# Functions
###############################################################################
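# Logging and comparison helpers: the compare* family takes (actual, expected..., message),
# reports mismatches through diagerror, and returns nonzero so call sites can chain with '||'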
logstdout() { echo "DIAG: $1" | tee -a /run/checknode/log; logger -- checknode: DIAG: $1; }
logstderr() { echo "DIAG_ERROR: $@" | tee -a /run/checknode/log 1>&2; logger -- checknode: DIAG_ERROR: $@; }
diagerror() { ((ERRORCNT++)); logstderr "$@"; [ -z "$ERRORSTR" ] && ERRORSTR="$@"; return 1; }
diaginfo() { logstdout "$1"; return 1; }
compare() { [[ "$1" == "$2" ]] || diagerror "$3 - Got '$1' but expected '$2'"; }
compare_info() { [[ "$1" == "$2" ]] || diaginfo "$3 - Got '$1' but expected '$2'"; }
compare2() { [[ "$1" == "$2" || "$1" == "$3" ]] || diagerror "$4 - Got '$1' but expected '$2' or '$3'"; }
compare3() { [[ "$1" == "$2" || "$1" == "$3" || "$1" == "$4" ]] || diagerror "$5 - Got '$1' but expected '$2', '$3', or '$4'"; }
compare4() { [[ "$1" == "$2" || "$1" == "$3" || "$1" == "$4" || "$1" == "$5" ]] || diagerror "$6 - Got '$1' but expected '$2', '$3', '$4', or '$5'"; }
compare_ne() { [[ "$1" != "$2" ]] || diagerror "$3 - Should not get value '$2'"; }
compare_ge() { [[ "$1" -ge "$2" ]] || diagerror "$3 - Got '$1' but expected >= '$2'"; }
compare_le() { [[ "$1" -le "$2" ]] || diagerror "$3 - Got '$1' but expected <= '$2'"; }
compare_re() { [[ "$1" =~ $2 ]] || diagerror "$3 - Expected '$1' to match regex '$2'"; }
compare_nre() { [[ "$1" =~ $2 ]] && diagerror "$3 - Matched '$1' with regex '$2'"; }
checkproc() { pgrep -f $1 >/dev/null || diagerror "$1 is not running"; }
checkkmod() { lsmod | egrep -q "^$1" || diagerror "Kernel module $1 is not loaded"; }
run() { OUT="$("$@")"; [ $? -ne 0 ] && diagerror "$OUT"; }
verbose() { [ $VERBOSE -eq 1 ] && echo "checknode: $(date -Iseconds) - $@"; }
startslurmd() { pgrep slurmd >/dev/null 2>&1 || ([ $SLURM_OK -eq 1 ] && systemctl start slurmd && sleep 35); }
stopslurmd() { pgrep slurmd >/dev/null 2>&1 && systemctl stop slurmd; }
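# readslurmstate populates CURRENTSTATE, FULLSTATE, REASONUSER, and CURRENTREASON
# from sinfo for this node, falling back to "unknown" when Slurm is unreachable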
readslurmstate() { [ $SLURM_OK -eq 1 ] && read -t 5 CURRENTSTATE FULLSTATE REASONUSER CURRENTREASON < <(sinfo -n $(hostname -s) -N --local -O statecompact:20,statecomplete:50,user,reason:150 --noheader |head -1) || CURRENTSTATE="unknown"; }
# journalgrep looks for lines in the journal, caching results
# Expects arguments:
# $1 a name to store the cache as
# $2 a 'grep' of what to look for (-g argument to journalctl)
# $3 error to report back to checknode
journalgrep() {
  CFILE=/run/checknode/journalcache/$1
  [ -e $CFILE ] && SINCE="$(stat -c %y $CFILE | sed 's/\..*//')" || SINCE="-1y"
  touch $CFILE
  journalctl -q -g "$2" --since "$SINCE" >> $CFILE
  [ $(wc -l < $CFILE) -gt 0 ] && diagerror "$3"
}
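# Hypothetical example call (names and messages are illustrative, not from a real check):
#   journalgrep mce_events 'Machine Check Event' 'MCE logged - investigate'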
[ $UNDRAIN -eq 1 ] && UD=' with force undrain flag' || UD=''
[ -z "$SUDO_USER" ] && LM='' || LM=" ($SUDO_USER)"
logstdout "Checknode running${UD}${LM}"
[ $RUNNINGJOB -eq 1 ] && [ ! -z "$SUDO_USER" ] && logstderr "A job is running - exiting" && exit 1
[ -e /tmp/.spilock ] && diagerror "SPI lock file is in place $(cat /tmp/.spilock) - node must reboot"
###############################################################################
# Check if nodescreen is currently running
###############################################################################
if [[ -e /run/nodescreen/lock ]]; then
  diaginfo "/run/nodescreen/lock in place. Node is currently running nodescreen test due to either a GCD change, expired weekly test, or manual run. This takes <= 15 minutes."
  echo "pass" > /run/checknode/state
  exit 1
fi
###############################################################################
# Generic checks
###############################################################################
# This saves "early" dmesg to a file in case it rolls later so we don't miss anything
[ -e /run/checknode/dmesg-early ] || dmesg > /run/checknode/dmesg-early
# Check for and save a reboot reason
reboot_reasons="29 mp1_wdtout. This bit will be set to 1 when MP1_Watchdog timer time out.
27 sync_flood. System reset was caused by a SYNC+FLOOD event which was due to an UE error.
26 remoteresetfromasf. System reset was caused by a remote RESET command from ASF.
25 watchdogissuereset. System reset caused by Microsoft WatchDog Timer.
24 failbootrst. System reset was caused by AMD Fail boot timer.
23 shutdown_msg. System reset was caused by a SHUTDOWN command from CPU.
22 kb_reset. System reset was caused by assertion of KB_RST_L.
21 sleepreset. Reset status from Sleep state.
19 do_k8_reset. System reset was caused by CF9 = 0x06.
18 do_k8_init. System reset was caused by CF9 = 0x04.
17 soft_pcirst. System reset was caused by writing to PMIO.
16 userrst. Last reset was caused by BP_SYS_RST_L assertion.
15 pmeturnofftime. Reset: 0h.
14 pmeturnofftime. Reset: 0h.
9 intthermaltrip. System was shut down due to an internal ThermalTrip event.
4 remotepowerdownfromasf. SOC has received a remote Power Off command from ASF.
2 shutdown. System was shut down due to ShutDown event.
1 pwrbtn4second. System was shut down due to 4s PwrButton event.
0 thermaltrip. System was shut down due to BP_THERMTRIP_L assertion."
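# The reboot-reason code is read through what appears to be the AMD SMN index/data
# register pair at 0x60/0x64 on device 00:00.0; each set bit maps to a line above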
if [ ! -f /run/checknode/reboot_reason ]; then
  touch /run/checknode/reboot_reason
  CODE=$(setpci -s 00:00.0 60.L=0x02D013C0 && setpci -s 00:00.0 64.L)
  echo "$reboot_reasons" | while read num rest; do
    [ $((0x$CODE & 1 << $num)) -ne 0 ] && echo $rest >> /run/checknode/reboot_reason && logstderr "Reboot reason: $CODE $num $rest"
  done
fi
###############################################################################
# Firmware checks
###############################################################################
verbose Checking Firmware
compare2 "$(/usr/sbin/dmidecode -s bios-version)" "1.8.0" "1.9.0" "BIOS version does not match expected version"
compare "$(/usr/sbin/dmidecode |awk '/BIOS Revision:/ {print $3}')" "5.21" "BIOS Revison does not match expected version"
###############################################################################
# Process checks
###############################################################################
verbose Checking processes
checkproc munged
#checkproc postfix
###############################################################################
# Early GPU checks - so the Slurm drain reason becomes this
###############################################################################
grep -q 'Board power calibration failed' /run/checknode/dmesg-early && diagerror 'GPU Board power calibration failed'
journalgrep gpu_doorbell 'queue_doorbell_id0 is not 0, Queue preemption time out' 'GPU Queue preemption time out - CHECK NODE STATE BEFORE REBOOTING'
journalgrep ras_poison 'RAS poison consumption' 'amdgpu: RAS poison consumption - Run uts oblex and if another UE is encountered replace the GPU'
journalgrep sq_intr 'amdgpu: sq_intr' 'GPU sq_intr - put in HBM sandbox'
journalgrep uncorrectable_error 'amdgpu: Uncorrectable error detected in UMC' 'GPU UE in UMC - put in HBM sandbox'
###############################################################################
# Stray User Process Check
###############################################################################
verbose Checking user processes
mkdir /sys/fs/cgroup/cpuset/slurm 2>/dev/null
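# Kill anything left in the slurm cpuset: up to ~15 passes of kill -9, two seconds
# apart; survivors are reported as unkillable and their kernel stacks are logged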
if [[ $RUNNINGJOB -ne 1 ]]; then
  # First try to kill any processes under Slurm
  STRAYPROCS=0
  COUNT=0
  until [ -z "$(find /sys/fs/cgroup/cpuset/slurm/ -name tasks -not -path '*system*' -exec cat {} \;)" ]; do
    if [ $COUNT -gt 15 ]; then
      STRAYPROCS=1
      diagerror "Unkillable user processes still present from $(find /sys/fs/cgroup/cpuset/slurm/ -mindepth 2 -maxdepth 2 -type d -exec basename {} \; | tr '\n' ' ')"
      find /sys/fs/cgroup/cpuset/slurm/ -name tasks -not -path '*system*' -exec cat {} \; | sort | uniq | xargs -IHERE /bin/bash -c 'echo "--- Unkillable process $(cat /proc/HERE/comm) pid HERE ---"; cat /proc/HERE/stack' | logger -t unkillable_process
      break
    fi
    find /sys/fs/cgroup/cpuset/slurm/ -name tasks -not -path '*system*' -exec cat {} \; | xargs --no-run-if-empty kill -9 >/dev/null 2>&1
    sleep 2
    let COUNT=COUNT+1
  done
  # Try to remove any stray cpuset directories
  [ $STRAYPROCS -eq 0 ] && find /sys/fs/cgroup/cpuset/slurm/ -depth -mindepth 1 -type d -not -name system -delete > /dev/null 2>&1
  [ $STRAYPROCS -eq 0 ] && compare "$(find /sys/fs/cgroup/cpuset/slurm/ -mindepth 1 -type d -not -name system -exec basename {} \;| tr '\n' ' ')" "" "Stray cgroup directories"
  #compare "$(pgrep slurmstepd | wc -l)" 0 "Hung slurmstepd processes"
fi
pgrep mprime &>/dev/null && diagerror 'mprime is running - leftover stress run'
# Clear any caches here
echo 3 > /proc/sys/vm/drop_caches
###############################################################################
# CPU Checks
###############################################################################
verbose CPU Checks
compare "$(cat /sys/devices/system/cpu/online)" "0-127" "CPU Online Count"
# CPU appears to jump frequencies - re-enable later
#compare "$(lscpu | grep "CPU MHz" | awk '{print $3}')" "2250.000" "CPU Frequency Check"
if [ -f /rocm/cpu_mp1_stuck_chk.py ]; then
  /rocm/cpu_mp1_stuck_chk.py >/dev/null || diagerror "Stuck CPU power management firmware"
fi
[ -d /sys/cray/pm_counters ] || diagerror "pm_counters directory not present - possible i2c problem"
###############################################################################
# HSN checks
###############################################################################
verbose Checking HSN Link Speed and AMA
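# CXIPCI collects 10 hours of cxi 'L0 to Recovery' PCIe journal lines; per interface,
# the checks below take the largest one-minute rate reported on any line and the
# total number of such lines in the window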
MISSING_IFACE=0
CXIPCI=$(journalctl -q -o short-unix --since="10 hours ago" -g 'cxi_core.*PCIe error: L0 to Recovery Entry')
NOW=$(date +%s)
for interface in hsn0 hsn1 hsn2 hsn3; do
  if [ -e /sys/class/net/${interface} ]; then
    compare_ne "$(cat /sys/class/net/${interface}/device/fru/serial_number)" "" "Interface $interface serial number blank" || continue
    compare "$(cat /sys/class/net/${interface}/device/uc/qspi_blob_version)" "1.5.49-ESM" "FW version for $interface does not match expected version" || continue
    compare "$(cat /sys/class/net/${interface}/device/port/link)" "up" "Interface $interface link status" || continue
    compare2 "$(cat /sys/class/net/${interface}/device/port/speed)" "BS_200G" "NA" "Interface $interface link speed" || continue
    compare "$(cat /sys/class/net/${interface}/device/port/pause)" "pfc/802.1qbb" "Interface $interface pause mode" || continue
    compare "$(cat /sys/class/net/${interface}/device/port/link_layer_retry)" "on" "Interface $interface LLR" || continue
    compare_re "$(cat /sys/class/net/${interface}/device/current_link_speed)" "16(\.0)? GT/s( PCIe)?$" "Interface $interface link speed" || continue
    compare "$(cat /sys/class/net/${interface}/device/current_link_width)" 16 "Interface $interface link width" || continue
    compare "$(cat /sys/class/net/${interface}/device/properties/speed)" 200000 "Interface $interface speed" || continue
    compare "$(cat /sys/class/net/${interface}/device/properties/current_esm_link_speed)" "25.0 GT/s" "Interface $interface ESM speed" || continue
    compare "$(cat /sys/class/net/${interface}/device/properties/link)" 1 "Interface $interface link" || continue
    compare "$(cat /sys/class/net/${interface}/addr_assign_type)" 3 "Interface $interface mac mode" || continue
    iface_ip=$(ip -brief -4 a ls ${interface} | awk '{print $3}' | sed 's/\/.*//')
    expected_mac=$(awk -v IP=$iface_ip '$2 == IP {print $1}' /etc/ethers 2>/dev/null)
    [ -z "$expected_mac" ] || compare "$(cat /sys/class/net/${interface}/address)" $expected_mac "Interface $interface AMA" || continue
    [ -e /root/${interface}_down_allowed ] || echo 10 > /root/${interface}_down_allowed
    compare_le $(echo "$CXIPCI" | awk -F= -viface=$interface 'BEGIN{max=0}$0 ~ iface {if ($2>max) max=$2}END{print max}') 480 "Interface $interface PCIe 1-minute burst"
    compare_le $(echo "$CXIPCI" | awk -viface=$interface -vnow=$NOW 'BEGIN{COUNT=0}$6 ~ iface && $1 > now - (60*60*10) {COUNT++}END{print COUNT}') 300 "Interface $interface PCIe 1-minute hits in a 10 hour window"
  else
    diagerror "Interface ${interface} is not present in /sys/class/net"
    MISSING_IFACE=1
  fi
done
journalgrep cxi_tct_tbl_dealloc 'C_EC_CRIT: C_PCT_EXT error: tct_tbl_dealloc' 'Interface tct table dealloc'
journalgrep cxi_uncor 'C_EC_UNCOR_NS: mem_ucor_err_cntr' 'Interface uncorrectable errors'
journalgrep cxi_credit_uflow 'C_EC_UNCOR_NS: C1_PCT error: credit_uflw' 'Interface credit underflow (reboot node)'
journalgrep cxi_sensor_crit 'cxi_core.*sensor.*in critical state' 'Cassini sensor in critical state'
dmesg | grep -q spt_tbl_rd_misc_unused && dmesg | grep -q 'C_PCT_ERR_INFO_EXT 0000000021000000' && diagerror 'spt_tbl_rd_misc_unused stuck packets (reboot node)'
# If an interface is missing, the hsn-to-cxi numbering will be off,
# so skip all of these checks rather than test the wrong device
if [ $MISSING_IFACE == 0 ]; then
  for cxi in cxi0 cxi1 cxi2 cxi3; do
    compare "$(systemctl is-active cxi_rh@${cxi})" "active" "Cassini Retry Handler service on $cxi"
    if [ -d /sys/class/cxi/$cxi/device/link_restarts ]; then
      BURST=0
      FLAPS=0
      NOW=$(date +%s)
      for time in $(cat /sys/class/cxi/$cxi/device/link_restarts/time_* | egrep -v '^0$'); do
        if [ $time -ge $((NOW-3600)) ]; then
          BURST=$((BURST+1))
          FLAPS=$((FLAPS+1))
        elif [ $time -ge $((NOW-36000)) ]; then
          FLAPS=$((FLAPS+1))
        fi
      done
      compare_le $BURST 3 "Interface $cxi flaps in the last hour"
      compare_le $FLAPS 8 "Interface $cxi flaps in the last 10 hours"
      # Attempt to clear any stuck services here - in addition to the epilog
      /usr/bin/cxi_service list -d $cxi | awk '/ID:/ {svc=$2}; /System Service.*No/ {if (svc > 1) print svc}' | xargs -r -n1 /usr/bin/cxi_service delete -d $cxi -s &> /dev/null
      compare "$(/usr/bin/cxi_service list -d $cxi | awk '/ID:/ {svc=$2}; /System Service.*No/ {if (svc > 1) print svc}' | wc -l)" "0" "$cxi stuck services"
    fi
  done
  [ -e /home/cxi_debug/trstest.py ] && run /home/cxi_debug/trstest.py
fi
compare "$(dmesg |egrep -c pfc_fifo_oflw\|pbuf_rd_err)" 0 "pfc_fifo_oflw or pbuf_rd_err (PKTBUF, check switches and reboot node)"
compare_ge $(ip neigh|grep -c PERMANENT) 150000 "Permanent ARP entries"
###############################################################################
# GPU checks
###############################################################################
verbose Checking GPUs
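# GPUMAP holds the per-GCD gpuid values as reported by rvs (the numbering changed
# with ROCm 6); PCIMAP holds the matching PCI bus numbers for GPUs 0-7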
GPUMAP=(62663 60925 35483 58294 39007 61770 44563 34574)
RV=$(basename $(readlink /opt/rocm-default))
if [ ${RV:5:1} -ge 6 ]; then
  GPUMAP=(29312 27578 3292 26097 7704 30477 10324 329)
fi
PCIMAP=('c1' 'c6' 'c9' 'ce' 'd1' 'd6' 'd9' 'de' )
mkdir -p /run/checknode/agt 2>/dev/null
egrep -q 'amdgpu.*DED' /run/checknode/dmesg-early && diagerror "GPU SRAM ECC Uncorrectable Error (DED) on GPU $(awk '/amdgpu.*DED/ {print $4}' /run/checknode/dmesg-early)"
#grep -v SECURE /run/checknode/dmesg-early | egrep -q amdgpu.*SEC && diagerror "GPU SRAM ECC Correctable Error (SEC)"
grep -Pzq 'type: fatal\n.*Hardware Error.*fru_text: SmnError' /run/checknode/dmesg-early && diagerror "BERT fatal SmnError"
# This broke on borg - need to ask AMD
#if [ ! -e /root/xgmi3_check_links.log ]; then
# /usr/bin/python3 /rocm/check_xgmi3_links/mi200_xgmi3_check_links.py > /root/xgmi3_check_links.log 2>&1
#fi
#compare "$(/usr/bin/wc -l /root/xgmi3_check_links.log | awk '{print $1}')" 0 "xGMI3 boot-time link events (see /root/xgmi3_check_links.log)"
[ -e /rocm/rvs_checknode/xgmi_bandwidth_test.sh ] && [ ! -e /run/checknode/xgmi_bandwidth_test.out ] && /rocm/rvs_checknode/xgmi_bandwidth_test.sh
if [ -e /run/checknode/xgmi_bandwidth_test.out ]; then
  grep -q FAIL /run/checknode/xgmi_bandwidth_test.out && diagerror "xGMI bandwidth test failed - see /run/checknode/xgmi_bandwidth_test.out"
fi
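# One-time DGEMM performance run: rvs results are joined with each GPU's FRU
# serial (read via the agt tool) and sent to syslog for fleet-wide tracking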
if [ ! -e /root/rvs_dgemm_perf.log ]; then
  /opt/rocm-default/bin/rvs -c /root/rvs_dgemm_perf.conf > /root/rvs_dgemm_perf.log
  sed 's/.*]//' /root/rvs_dgemm_perf.log | awk '/Target/ {print "RVS dgemm_perf gpuid "$2" Gflops "$4}' > /root/rvs_gpu_serial.log
  for gpu_num in {0..7}; do
    pci=0000:${PCIMAP[$gpu_num]}:00.0
    [ -e /run/checknode/agt/$gpu_num ] || /rocm/agt -i=PCI:$pci -i2cflrm:4,100,4,4 > /run/checknode/agt/$gpu_num
    fru=$(grep Serial /run/checknode/agt/$gpu_num | awk '{print $5}')
    awk -v fru=$fru -v pci=$pci '/'${GPUMAP[$gpu_num]}'/ {print $0 " FRU_SERIAL "fru" PCI_ADDR "pci}' /root/rvs_gpu_serial.log | logger
  done
fi
grep -q "Failed to read EEPROM table header" /run/checknode/dmesg-early && diagerror "Failed to read EEPROM table header"
mkdir -p /run/checknode/gpumem 2>/dev/null
for gpuid in {0..7}; do
  gpu=card${gpuid}
  if [ -e /sys/class/drm/${gpu} ]; then
    compare_re "$(cat /sys/class/drm/${gpu}/device/current_link_speed)" "16(\.0)? GT/s( PCIe)?$" "GPU ${gpu} link speed"
    compare "$(cat /sys/class/drm/${gpu}/device/current_link_width)" 16 "GPU ${gpu} link width"
    compare2 "$(cat /sys/class/drm/${gpu}/device/vbios_version)" "113-D65201-046" "113-D65201-X46" "GPU ${gpu} VBIOS IFWI version"
    compare_ge $(cat /sys/class/drm/${gpu}/device/mem_info_vram_total) 68000000000 "GPU ${gpu} Total Memory"
    cat /sys/class/drm/${gpu}/device/hwmon/hwmon*/temp1_input > /dev/null 2>&1 || diagerror "GPU ${gpu} unable to read SMU Metrics table"
    #compare_ge "$(grep "perf.* ${GPUMAP[$gpuid]} .*Gflops.*Target" /root/rvs_dgemm_perf.log|sed 's/.*perf //'|awk '{printf "%d\n", $3}')" 20500 "GPU ${gpu} DGEMM Gflops"
    compare_le "$(wc -l /sys/class/drm/${gpu}/device/ras/gpu_vram_bad_pages | awk '{print $1}')" 257 "GPU ${gpu} HBM UE Retired Pages"
    # Index PCIMAP by this loop's gpuid, not the rvs loop's gpu_num
    pci=0000:${PCIMAP[$gpuid]}:00.0
    [ -e /run/checknode/agt/$gpuid ] || /rocm/agt -i=PCI:$pci -i2cflrm:4,100,4,4 > /run/checknode/agt/$gpuid
    compare "$(awk '/FW version/ {print $6}' /run/checknode/agt/$gpuid)" "3.16" "GPU $gpu RM version"
  else
    diagerror "GPU ${gpu} is not present in /sys/class/drm"
  fi
done
journalgrep gpureset 'amdgpu: GPU reset begin' 'Previous GPU reset may impact performance - please reboot'
# Start of GPU Used Memory checks
AMDGPUPIDS=$(ps -edalf | grep amdgpu | grep -v umc_page_retire | grep -v amdgpu-reset-hi | grep -v grep |wc -l)
[ -z "$SLURM_JOB_ID" ] && WAIT=14 || WAIT=120
SECONDS=1
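# SECONDS is bash's built-in elapsed-time counter; it is shared across all eight
# GPUs below, so once WAIT elapses each remaining GPU still gets one poll via CHECKED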
for gpuid in {0..7}; do
  CHECKED=0
  FIRSTRUN=0
  gpu=card${gpuid}
  [ ! -e /sys/class/drm/${gpu} ] && continue
  [ -e /run/checknode/gpumem/$gpu ] && OLDVAL=$(cat /run/checknode/gpumem/$gpu) || FIRSTRUN=1
  while [[ $SECONDS -le $WAIT || $CHECKED -eq 0 ]]; do
    NEWVAL=$(cat /sys/class/drm/${gpu}/device/mem_info_vram_used)
    [[ $NEWVAL -le 20000000 ]] && break
    CHECKED=1
    if [ $SECONDS -ge $((WAIT-2)) ]; then
      AMDGPUPIDS=$(ps -edalf | grep amdgpu | grep -v umc_page_retire | grep -v amdgpu-reset-hi | grep -v grep | wc -l)
      compare_le $NEWVAL 20000000 "GPU ${gpu} Used Memory with $AMDGPUPIDS amdgpu kworker procs"
      /opt/rocm-default/bin/rocm-smi --showpids | logger -t AMD_GPU_USED_MEM_DEBUG
      ps aux | grep -i kfd | grep -v grep | logger -t AMD_GPU_USED_MEM_DEBUG
      break
    fi
    sleep 0.2
  done
  echo $NEWVAL > /run/checknode/gpumem/$gpu
  if [ $FIRSTRUN -eq 1 ]; then
    MEMDIFF="INITIAL RUN"
  elif [ $NEWVAL -gt $OLDVAL ]; then
    MEMDIFF="INCREASE of $((NEWVAL-OLDVAL))"
  elif [ $NEWVAL -lt $OLDVAL ]; then
    MEMDIFF="DECREASE of $((OLDVAL-NEWVAL))"
  else
    MEMDIFF="UNCHANGED"
  fi
  AMDGPUPIDS=$(ps -edalf | grep amdgpu | grep -v umc_page_retire | grep -v amdgpu-reset-hi | grep -v grep | wc -l)
  logger -t gpumem -- GPU $gpu memory - current $NEWVAL - $MEMDIFF - amdgpu kworker $AMDGPUPIDS - jobid ${SLURM_JOB_ID-none}
done
[[ $SECONDS -gt 10 ]] && logger -t gpumem "GPU memory check took ${SECONDS} seconds"
# End of GPU Used Memory checks
###############################################################################
# Boot Error Checks
###############################################################################
# None on EX at the moment
###############################################################################
# NVME Health Checks
###############################################################################
verbose NVME Health Checks
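# For each NVMe: confirm the namespace, model, firmware revision, and live state,
# decode the PCIe Link Status register (CAP_EXP+0x12: bits 3:0 current speed
# generation, bits 9:4 negotiated width), then check SMART health via nvme-cli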
for nvme in nvme0 nvme1; do
  if [ -e /sys/class/nvme/${nvme} ]; then
    [ -e /dev/${nvme}n1 ] || diagerror "NVME namespace ${nvme}n1 does not exist"
    compare "$(cat /sys/class/nvme/${nvme}/model | xargs)" "SAMSUNG MZ1L21T9HCLS-00A07" "${nvme} model"
    compare2 "$(cat /sys/class/nvme/${nvme}/firmware_rev)" "GDC7302Q" "GDC7402Q" "${nvme} FW"
    compare $(cat /sys/class/nvme/${nvme}/state 2>/dev/null) live "${nvme} state"
    compare $((0x$(/sbin/setpci -s $(basename $(readlink /sys/class/nvme/${nvme}/device)) CAP_EXP+0x12.w) & 0xf)) 4 "${nvme} PCIe Gen"
    compare $(((0x$(/sbin/setpci -s $(basename $(readlink /sys/class/nvme/${nvme}/device)) CAP_EXP+0x12.w) & 0x3f0) >> 4)) 4 "${nvme} PCIe Width"
    SL=$(/usr/sbin/nvme smart-log /dev/${nvme} -o json)
    compare "$(echo $SL | jq .critical_warning)" 0 "${nvme} critical warning"
    compare_ge "$(echo $SL | jq .avail_spare)" 20 "${nvme} available spare"
  else
    diagerror "NVME ${nvme} device check failed."
  fi
done
# Export so the setting actually reaches the lvm child processes
export LVM_SUPPRESS_FD_WARNINGS=1
/sbin/vgdisplay nvme > /dev/null 2>&1 || diagerror "NVME volume group error"
###############################################################################
# File System Checks
###############################################################################
verbose Checking file systems
if [ $LOCALONLY -eq 0 ]; then
  /sbin/lvdisplay nvme/persistent &>/dev/null || diagerror "NVME Volume Group persistent does not exist (check for stale bb lv)"
  /usr/bin/mountpoint -q /mnt/persistent || diagerror "NVME /mnt/persistent is not mounted"
  for mnt in /autofs/nccs-svm1_home /autofs/nccs-svm1_home1 /autofs/nccs-svm1_home2 /autofs/nccs-svm1_proj /autofs/nccs-svm1_sys /autofs/nccs-svm1_sw; do
    compare "$(awk -v M=$mnt '$2 == M {print $3}' /proc/self/mounts)" dvs "$mnt mount is incorrect"
    timeout -k 5 30 stat $mnt > /dev/null 2>&1 || diagerror "$mnt mount is not healthy"
  done
  timeout -k 5 15 ls -d /sw > /dev/null 2>&1 || diagerror "NFS (/sw) is not mounted or hung"
  compare "$(awk '$2 == "/lustre/orion" {print $3}' /proc/self/mounts)" lustre "Lustre (/lustre/orion) is not mounted"
  timeout -k 5 60 df >/dev/null 2>&1 || diagerror "df error or 60 second timeout"
fi
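# Each HSN interface's LNet NID is derived from its MAC address: drop the 02
# locally-administered prefix, strip colons and leading zeros, and convert the
# remaining hex to decimal; that NID must appear with an @kfi suffix in lnet/nis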
LNET_NIS=$(awk '/kfi/ {print $1}' /sys/kernel/debug/lnet/nis | uniq)
for iface in hsn{0..3}; do
  NIDHEX=$(cat /sys/class/net/${iface}/address | sed 's/^02//' | sed 's/://g' | sed 's/^0*//')
  echo "$LNET_NIS" | grep -q $(printf '%d' "0x"$NIDHEX)"@kfi" || diagerror "LNET NI for ${iface} missing"
done
# Check for processes stuck in IO wait - give a couple retries
COUNT=0
until [ "$(ps axo stat | grep -c D)" == "0" ]; do
  if [ $COUNT -gt 10 ]; then
    compare "$(ps axo stat | grep -c D)" 0 "Processes stuck in IO Wait (D)"
    logstderr "$(ps axo stat,user,comm | egrep '^D')"
    break
  fi
  sleep 1
  let COUNT=COUNT+1
done
###############################################################################
# Host Memory Checks
###############################################################################
verbose Checking memory
find /dev/shm -depth -mindepth 1 -not -name alps -delete > /dev/null 2>&1
compare_ge $(awk '/MemTotal/ {print $2}' /proc/meminfo) 520000000 "Total memory"
compare_ge $(awk '/MemAvailable/ {print $2}' /proc/meminfo) 460000000 "Available memory"
compare "$(awk '$2 == "/dev/hugepages" {print $3}' /proc/self/mounts)" "hugetlbfs" "/dev/hugepages is not mounted"
DMIMEM=$(/usr/sbin/dmidecode --type memory)
compare $(echo "$DMIMEM" | grep Manufacturer: | sort -u | wc -l) 1 "Number of memory manufacturers"
compare $(echo "$DMIMEM" | egrep ^[[:space:]]Size: | sort -u | wc -l) 1 "DIMM sizes"
compare $(echo "$DMIMEM" | egrep ^[[:space:]]Speed: | sort -u | wc -l) 1 "DIMM speeds"
compare $(echo "$DMIMEM" | awk '/Number Of Devices/ {print $4}') 8 "Count of DIMMs"
###############################################################################
# State Updates
###############################################################################
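# On failure: record the error summary as the drain reason, unless an existing
# reason was set by hand (anything outside the recognized automatic reasons)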
if [ $ERRORCNT -gt 0 ]; then
  readslurmstate
  echo "fail" > /run/checknode/state
  ERRORSTR2="checknode($ERRORCNT) $(echo "$ERRORSTR" | tr '\n' ';')"
  if [[ $CHECKONLY -eq 1 ]]; then
    logstderr "Node is unhealthy - checkonly mode"
  elif [[ "$CURRENTREASON" == "$ERRORSTR2" ]]; then
    logstderr "Node is unhealthy - reason unchanged"
  elif [[ "$CURRENTREASON" != "checknode"* &&
         "$CURRENTREASON" != "Kill task failed" &&
         "$CURRENTREASON" != "Not responding" &&
         "$CURRENTREASON" != "Prolog error" &&
         "$CURRENTREASON" != "Epilog error" &&
         "$CURRENTREASON" != "SPI job"* &&
         "$CURRENTREASON" != "switch_g_job_postfini failed" &&
         "$CURRENTREASON" != "none" &&
         "$CURRENTREASON" != "" ]]; then
    logstderr "Node is unhealthy - not changing existing reason: $CURRENTREASON"
  else
    [ $SLURM_OK -eq 1 ] && logstderr "Node is unhealthy - marking drain"
    [ $SLURM_OK -eq 1 ] && scontrol --local update node=$(hostname) state=drain reason="$ERRORSTR2" > /dev/null
  fi
  if [[ "$ERRORSTR" == *"hsn"* ]]; then
    logstderr "Stopping slurmd due to hsn errors"
    stopslurmd
  else
    startslurmd
  fi
  exit 1
fi
# If this is initial startup, slurmd might not be running
startslurmd
readslurmstate
# If we made it here, the node is healthy
echo "pass" > /run/checknode/state
# In checkonly mode, or when the node is already in a normal state, leave the
# Slurm state alone; likewise preserve any hand-set drain reason unless -u was given
if [[ $CHECKONLY -eq 1 ||
      "$CURRENTSTATE" == "idle" ||
      "$CURRENTSTATE" == "plnd" ||
      "$CURRENTSTATE" == "maint" ||
      "$CURRENTSTATE" == "resv" ]]; then
  logstdout "Node is healthy - not changing state from $CURRENTSTATE"
elif [[ "$CURRENTREASON" == "Node unexpectedly rebooted" && $UNDRAIN -eq 0 ]]; then
  logstdout "Node is healthy - NOT clearing unexpected reboot - use 'checknode -u' to clear IF you understand the reboot reason"
elif [[ "$CURRENTREASON" != "checknode"* &&
       "$CURRENTREASON" != "Kill task failed" &&
       "$CURRENTREASON" != "Not responding" &&
       "$CURRENTREASON" != "SPI job"* &&
       "$CURRENTREASON" != "switch_g_job_postfini failed" &&
       "$CURRENTREASON" != "none" &&
       $UNDRAIN -eq 0 ]]; then
  logstderr "Node is healthy - not changing existing reason: '$CURRENTREASON' - use 'checknode -u' to clear IF you know it is safe to clear"
else
  [ $SLURM_OK -eq 1 ] && logstdout "Node is healthy - marking online" || echo "Node is healthy - not contacting slurm"
  [ $SLURM_OK -eq 1 ] && scontrol --local update node=$(hostname) state=idle > /dev/null
fi
exit 0
# vim: ai:ts=2:sw=2:syn=sh