@@ -24,7 +24,7 @@ CLEARED=0
24
24
JM_WATCHDOG_PID=0
25
25
TM_WATCHDOG_PID=0
26
26
27
- function stop_cluster_and_watchdog () {
27
+ function stop_watchdogs () {
28
28
if [ ${CLEARED} -eq 0 ]; then
29
29
30
30
if ! [ ${JM_WATCHDOG_PID} -eq 0 ]; then
@@ -43,6 +43,15 @@ function stop_cluster_and_watchdog() {
43
43
fi
44
44
}
45
45
46
#######################################
# Check how many log files contain a given text.
#
# Recursively searches ${FLINK_DIR}/log/ for files whose name matches
# *<log_pattern>*.log and counts the files (not the lines) that contain
# the text at least once.
#
# Globals:
#   FLINK_DIR (read)
# Arguments:
#   $1: substring of the log file name (e.g. 'standalonesession')
#   $2: pattern to search for (interpreted by grep as a regular expression)
#   $3: expected number of log files containing the text
# Returns:
#   0 if the number of matching files equals $3, non-zero otherwise
#######################################
function verify_num_occurences_in_logs() {
    local log_pattern="$1"
    local text="$2"
    local expected_no="$3"

    # grep -l prints every matching file exactly once, so counting its
    # output lines counts files even when a path contains ':'.
    local actual_no
    actual_no=$(grep -r -l --include "*${log_pattern}*.log" -e "${text}" "${FLINK_DIR}/log/" | wc -l)
    [[ "${expected_no}" -eq "${actual_no}" ]]
}
54
+
46
55
function verify_logs() {
47
56
local OUTPUT=$FLINK_DIR /log/* .out
48
57
local JM_FAILURES=$1
@@ -56,14 +65,14 @@ function verify_logs() {
56
65
fi
57
66
58
67
# checks that all apart from the first JM recover the failed jobgraph.
59
- if ! [ ` grep -r --include ' * standalonesession*.log ' ' Recovered SubmittedJobGraph' " ${FLINK_DIR} /log/ " | cut -d " : " -f 1 | uniq | wc -l ` -eq ${ JM_FAILURES} ] ; then
68
+ if ! verify_num_occurences_in_logs ' standalonesession' ' Recovered SubmittedJobGraph' ${ JM_FAILURES}; then
60
69
echo " FAILURE: A JM did not take over."
61
70
EXIT_CODE=1
62
71
fi
63
72
64
73
if [ " $VERIFY_CHECKPOINTS " = true ]; then
65
74
# search the logs for JMs that log completed checkpoints
66
- if ! [ ` grep -r --include ' * standalonesession*.log ' ' Completed checkpoint' " ${FLINK_DIR} /log/ " | cut -d " : " -f 1 | uniq | wc -l ` -eq $ (( JM_FAILURES + 1 )) ] ; then
75
+ if ! verify_num_occurences_in_logs ' standalonesession' ' Completed checkpoint' $ (( JM_FAILURES + 1 )) ; then
67
76
echo " FAILURE: A JM did not execute the job."
68
77
EXIT_CODE=1
69
78
fi
@@ -77,26 +86,39 @@ function verify_logs() {
77
86
78
87
#######################################
# Keep a fixed number of JobManager processes alive.
#
# Polls `jps` once per second and runs the supplied start command once
# for every missing process. Never returns on its own; it is meant to be
# run in the background and killed by the caller.
#
# Arguments:
#   $1: expected number of running JMs
#   $2: pattern matched (via grep) against the `jps` output to count JMs
#   $3...: command, including its arguments, that starts one new JM
#######################################
function jm_watchdog() {
    local EXPECTED_JMS=$1
    local PROCESS_NAME=$2
    # Keep the start command as an array so arguments containing
    # whitespace reach the command unchanged.
    local -a START_JM_CMD=("${@:3}")
    local RUNNING_JMS MISSING_JMS c

    while true; do
        # grep -c exits non-zero when nothing matches (it still prints 0);
        # '|| true' keeps that from aborting callers running with 'set -e'.
        RUNNING_JMS=$(jps | grep -c -e "${PROCESS_NAME}" || true)
        MISSING_JMS=$((EXPECTED_JMS - RUNNING_JMS))
        for ((c = 0; c < MISSING_JMS; c++)); do
            "${START_JM_CMD[@]}"
        done
        sleep 1
    done
}
91
100
101
#######################################
# Start a standalone JobManager on localhost.
#
# Globals:
#   FLINK_DIR (read)
# Arguments:
#   $1: value passed to jobmanager.sh after the host name
#######################################
function start_jm_cmd {
    local IP_PORT=$1
    "$FLINK_DIR/bin/jobmanager.sh" "start" "localhost" "${IP_PORT}"
}
105
+
106
#######################################
# Start watchdog for JM process
#
# Launches jm_watchdog in the background and records its PID.
#
# Globals:
#   JM_WATCHDOG_PID (written)
# Arguments:
#   $1: expected number of jms to run
#   $2: process name to monitor
#   $3...: command (with its arguments) to start a new jm
# Outputs:
#   Writes the watchdog PID to stdout
#######################################
function start_ha_jm_watchdog() {
    # Quote every forwarded argument so values containing whitespace are
    # passed to the watchdog unchanged.
    jm_watchdog "$1" "$2" "${@:3}" &
    JM_WATCHDOG_PID=$!
    echo "Running JM watchdog @ ${JM_WATCHDOG_PID}"
}
97
119
98
- function kill_jm {
99
- local JM_PIDS=` jps | grep ' StandaloneSessionClusterEntrypoint ' | cut -d " " -f 1`
120
+ function kill_single {
121
+ local JM_PIDS=` jps | grep " $1 " | cut -d " " -f 1`
100
122
local JM_PIDS=(${JM_PIDS[@]} )
101
123
local PID=${JM_PIDS[0]}
102
124
kill -9 ${PID}
0 commit comments