From be76391c5513dbb8696f94262fe5f544f9ec4971 Mon Sep 17 00:00:00 2001 From: Taylan Develioglu Date: Thu, 29 Jan 2015 15:55:23 +0100 Subject: [PATCH 1/5] Add support for replay_delay --- check_postgres.pl | 50 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/check_postgres.pl b/check_postgres.pl index 5f78bbb8..32757711 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -1148,6 +1148,7 @@ package check_postgres; hitratio => [0, 'Report if the hit ratio of a database is too low.'], hot_standby_delay => [1, 'Check the replication delay in hot standby setup'], index_size => [0, 'Checks the size of indexes only.'], + replay_delay => [0, 'Check the log replay delay during recovery'], table_size => [0, 'Checks the size of tables only.'], relation_size => [0, 'Checks the size of tables and indexes.'], last_analyze => [0, 'Check the maximum time in seconds since any one table has been analyzed.'], @@ -1752,6 +1753,7 @@ sub finishup { fsm_pages => 'VERSION: 8.2 MAX: 8.3', fsm_relations => 'VERSION: 8.2 MAX: 8.3', hot_standby_delay => 'VERSION: 9.0', + replay_delay => 'VERSION: 9.0', listener => 'MAX: 8.4', ); if ($opt{test}) { @@ -1945,6 +1947,9 @@ sub finishup { ## Check the replication delay in hot standby setup check_hot_standby_delay() if $action eq 'hot_standby_delay'; +## Check the log replay delay during recovery +check_replay_delay() if $action eq 'replay_delay'; + ## Check the maximum transaction age of all connections check_txn_time() if $action eq 'txn_time'; @@ -4743,6 +4748,45 @@ sub check_hitratio { } ## end of check_hitratio +sub check_replay_delay { + ## Check the log replay delay during recovery + ## Supports: Nagios + ## Critical and warning are the thresholds of delay in seconds. 
+ ## Example: --critical=5 + + my ($warning, $critical) = validate_range({type => 'integer', leastone => 1}); + + # We can't assume delay is none if last replayed equals last received, because in + # reality it could mean replication has gone out for lunch. + # This can lead to false negatives on an idle master, but is preferable to the + # opposite where replication has gone away and we assume everything is hunkey-dorey. + # This is also why I renamed this check to replay_delay from hot_standby_delay_slave. + # + # It only tells you the replication delay IF the master is active (i.e. receiving updates). +# $SQL = q{SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() +# THEN 0 +# ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) +# END AS log_delay;}; + $SQL = q{select EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) as log_delay;}; + my $info = run_command($SQL); + + for $db (@{$info->{db}}) { + my $delay = $db->{slurp}[0]->{log_delay}; + my $msg = qq{delay=${delay}s}; + + if (length $critical and $delay > $critical) { + add_critical($msg) + } + elsif (length $warning and $delay > $warning) { + add_warning($msg) + } + else { + add_ok($msg) + } + } + +} ## end of check_replay_delay + sub check_hot_standby_delay { ## Check on the delay in PITR replication between master and slave @@ -8874,6 +8918,12 @@ =head2 B check_hot_standby_delay --dbhost=master,replica1 --warning='1048576 and 2 min' --critical='16777216 and 10 min' +=head2 B<replay_delay> + +(C<symlink: check_postgres_replay_delay>) Returns seconds passed since last transaction replayed +during recovery. This practically tells you the replication delay of a hot standby (locally) IF +the master is active (i.e. master is receiving updates). 
+ =head2 B =head2 B From 221b7d3f9e6cdaa24bb74f82d98f204cea13491e Mon Sep 17 00:00:00 2001 From: Taylan Develioglu Date: Thu, 29 Jan 2015 18:14:33 +0100 Subject: [PATCH 2/5] Throw unknown on empty result set in replay delay --- check_postgres.pl | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index 32757711..4d8534d6 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -1753,7 +1753,7 @@ sub finishup { fsm_pages => 'VERSION: 8.2 MAX: 8.3', fsm_relations => 'VERSION: 8.2 MAX: 8.3', hot_standby_delay => 'VERSION: 9.0', - replay_delay => 'VERSION: 9.0', + replay_delay => 'VERSION: 9.0', listener => 'MAX: 8.4', ); if ($opt{test}) { @@ -4774,14 +4774,19 @@ sub check_replay_delay { my $delay = $db->{slurp}[0]->{log_delay}; my $msg = qq{delay=${delay}s}; + if (!length $delay) { + add_unknown("received empty result"); + return; + } + if (length $critical and $delay > $critical) { - add_critical($msg) + add_critical $msg; } elsif (length $warning and $delay > $warning) { - add_warning($msg) + add_warning $msg; } else { - add_ok($msg) + add_ok $msg; } } From 99b12fda2dff4addfbca04305674a1683403a80d Mon Sep 17 00:00:00 2001 From: Taylan Develioglu Date: Fri, 30 Jan 2015 10:02:04 +0100 Subject: [PATCH 3/5] (replay_delay) check if server is in recovery first --- check_postgres.pl | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index 4d8534d6..007a3182 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -4756,6 +4756,17 @@ sub check_replay_delay { my ($warning, $critical) = validate_range({type => 'integer', leastone => 1}); + # check if we are in recovery using pg_is_in_recovery() + $SQL = q{SELECT pg_is_in_recovery() AS recovery;}; + + my $info = run_command($SQL, { regex => qr([tf]) }); + for $db (@{$info->{db}}) { + my $status = $db->{slurp}[0]; + if ($status->{recovery} eq 'f') { + add_critical("not in recovery"); + 
return; + } + } # We can't assume delay is none if last replayed equals last received, because in # reality it could mean replication has gone out for lunch. # This can lead to false negatives on an idle master, but is preferable to the @@ -4768,17 +4779,12 @@ sub check_replay_delay { # ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) # END AS log_delay;}; $SQL = q{select EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) as log_delay;}; - my $info = run_command($SQL); + $info = run_command($SQL); for $db (@{$info->{db}}) { my $delay = $db->{slurp}[0]->{log_delay}; my $msg = qq{delay=${delay}s}; - if (!length $delay) { - add_unknown("received empty result"); - return; - } - if (length $critical and $delay > $critical) { add_critical $msg; } From 2768df479e646847c4f5ba9d926cc1363176d7f9 Mon Sep 17 00:00:00 2001 From: CJ Estel Date: Fri, 22 Nov 2019 17:03:38 -0500 Subject: [PATCH 4/5] Add max idle master delay tolerated for slave replication delay check --- check_postgres.pl | 61 ++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index 007a3182..d53c05a6 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -39,6 +39,9 @@ package check_postgres; ## Which user to connect as if --dbuser is not given $opt{defaultuser} = 'postgres'; +## Number of seconds that slave can go without receiving a write from master before alerting if --maxidlemasterdelay is not given +$opt{defaultidlemasterdelay} = 3600; + ## Which port to connect to if --dbport is not given $opt{defaultport} = 5432; @@ -975,19 +978,20 @@ package check_postgres; 'tempdir=s', 'get_method=s', 'language=s', - 'mrtg=s', ## used by MRTG checks only - 'logfile=s', ## used by check_logfile only - 'queryname=s', ## used by query_runtime only - 'query=s', ## used by custom_query only - 'valtype=s', ## used by custom_query only - 'reverse', ## used by custom_query only - 'repinfo=s', ## used 
by replicate_row only - 'noidle', ## used by backends only - 'datadir=s', ## used by checkpoint only - 'schema=s@', ## used by slony_status only - 'filter=s@', ## used by same_schema only - 'suffix=s', ## used by same_schema only - 'replace', ## used by same_schema only + 'mrtg=s', ## used by MRTG checks only + 'logfile=s', ## used by check_logfile only + 'queryname=s', ## used by query_runtime only + 'query=s', ## used by custom_query only + 'valtype=s', ## used by custom_query only + 'reverse', ## used by custom_query only + 'repinfo=s', ## used by replicate_row only + 'noidle', ## used by backends only + 'datadir=s', ## used by checkpoint only + 'schema=s@', ## used by slony_status only + 'filter=s@', ## used by same_schema only + 'suffix=s', ## used by same_schema only + 'replace', ## used by same_schema only + 'maxidlemasterdelay=i', ## used by check_replay_delay only ); die $USAGE if ! keys %opt and ! @ARGV; @@ -1018,6 +1022,9 @@ package check_postgres; elsif ($name =~ /^dbservice(\d+)$/o) { push @{ $opt{dbservice} } => $value; } + elsif ($name =~ /^maxidlemasterdelay(\d+)$/o) { + push @{ $opt{maxidlemasterdelay} } => $value; + } else { push @badargs => $arg; } @@ -1208,12 +1215,12 @@ package check_postgres; This is version $VERSION. Common connection options: - -H, --host=NAME hostname(s) to connect to; defaults to none (Unix socket) - -p, --port=NUM port(s) to connect to; defaults to $opt{defaultport}. - -db, --dbname=NAME database name(s) to connect to; defaults to 'postgres' or 'template1' - -u --dbuser=NAME database user(s) to connect as; defaults to '$opt{defaultuser}' - --dbpass=PASS database password(s); use a .pgpass file instead when possible - --dbservice=NAME service name to use inside of pg_service.conf + -H, --host=NAME hostname(s) to connect to; defaults to none (Unix socket) + -p, --port=NUM port(s) to connect to; defaults to $opt{defaultport}. 
+ -db, --dbname=NAME database name(s) to connect to; defaults to 'postgres' or 'template1' + -u --dbuser=NAME database user(s) to connect as; defaults to '$opt{defaultuser}' + --dbpass=PASS database password(s); use a .pgpass file instead when possible + --dbservice=NAME service name to use inside of pg_service.conf Connection options can be grouped: --host=a,b --host=c --port=1234 --port=3344 would connect to a-1234, b-1234, and c-3344 @@ -1225,6 +1232,7 @@ package check_postgres; --exclude=name(s) items to specifically exclude (e.g. tables), depends on the action --includeuser=include objects owned by certain users --excludeuser=exclude objects owned by certain users + --maxidlemasterdelay number of seconds slave can go without receiving a write from master; defaults to '$opt{defaultidlemasterdelay}' Other options: --assume-standby-mode assume that server in continious WAL recovery mode @@ -4756,6 +4764,9 @@ sub check_replay_delay { my ($warning, $critical) = validate_range({type => 'integer', leastone => 1}); + # set max idle master delay to override or the default + my $maxidlemasterdelay = $opt{maxidlemasterdelay} || $opt{defaultidlemasterdelay}; + # check if we are in recovery using pg_is_in_recovery() $SQL = q{SELECT pg_is_in_recovery() AS recovery;}; @@ -4774,11 +4785,13 @@ sub check_replay_delay { # This is also why I renamed this check to replay_delay from hot_standby_delay_slave. # # It only tells you the replication delay IF the master is active (i.e. receiving updates). -# $SQL = q{SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() -# THEN 0 -# ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) -# END AS log_delay;}; - $SQL = q{select EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) as log_delay;}; + # If the master is not active, if it exceeds the threshold specified it will also alert. 
+ $SQL = qq{SELECT CASE + WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() + AND EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) < $maxidlemasterdelay + THEN 0 + ELSE + EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS log_delay;}; $info = run_command($SQL); for $db (@{$info->{db}}) { From 4b34f3ea4ae017f87adca49f2ff895573ab69ca3 Mon Sep 17 00:00:00 2001 From: CJ Estel Date: Fri, 22 Nov 2019 17:10:02 -0500 Subject: [PATCH 5/5] remove spaces --- check_postgres.pl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index d53c05a6..690cf207 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -1215,12 +1215,12 @@ package check_postgres; This is version $VERSION. Common connection options: - -H, --host=NAME hostname(s) to connect to; defaults to none (Unix socket) - -p, --port=NUM port(s) to connect to; defaults to $opt{defaultport}. - -db, --dbname=NAME database name(s) to connect to; defaults to 'postgres' or 'template1' - -u --dbuser=NAME database user(s) to connect as; defaults to '$opt{defaultuser}' - --dbpass=PASS database password(s); use a .pgpass file instead when possible - --dbservice=NAME service name to use inside of pg_service.conf + -H, --host=NAME hostname(s) to connect to; defaults to none (Unix socket) + -p, --port=NUM port(s) to connect to; defaults to $opt{defaultport}. + -db, --dbname=NAME database name(s) to connect to; defaults to 'postgres' or 'template1' + -u --dbuser=NAME database user(s) to connect as; defaults to '$opt{defaultuser}' + --dbpass=PASS database password(s); use a .pgpass file instead when possible + --dbservice=NAME service name to use inside of pg_service.conf Connection options can be grouped: --host=a,b --host=c --port=1234 --port=3344 would connect to a-1234, b-1234, and c-3344