Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/add wal utilization check #178

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 122 additions & 1 deletion check_postgres.pl
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ package check_postgres;
'version-ok' => q{version $1},
'wal-numfound' => q{WAL files found: $1},
'wal-numfound2' => q{WAL "$2" files found: $1},
'wal-amount' => q{WAL data written in past $2: $1},
},

## Spanish
Expand Down Expand Up @@ -1145,6 +1146,7 @@ package check_postgres;
'version-ok' => q{Version $1},
'wal-numfound' => q{WAL-Dateien gefunden: $1},
'wal-numfound2' => q{WAL "$2" Dateien gefunden: $1},
'wal-amount' => q{WAL Geschriebene Daten in den letzten $2: $1},
},

## Persian
Expand Down Expand Up @@ -1719,10 +1721,11 @@ package check_postgres;
'suffix=s', ## used by same_schema only
'replace', ## used by same_schema only
'skipsequencevals', ## used by same_schema only
'lsfunc=s', ## used by wal_files and archive_ready
'lsfunc=s', ## used by wal_files, wal_amount and archive_ready
'object=s@', ## used by same_schema for object types to include
'skipobject=s@', ## used by same_schema for object types to exclude
'skipcycled', ## used by sequence only
'interval=s', ## used by wal_amount only
);

die $USAGE if ! keys %opt and ! @ARGV;
Expand Down Expand Up @@ -1925,6 +1928,7 @@ package check_postgres;
txn_wraparound => [1, 'See how close databases are getting to transaction ID wraparound.'],
version => [1, 'Check for proper Postgres version.'],
wal_files => [1, 'Check the number of WAL files in the pg_xlog directory'],
wal_amount => [1, 'Check the amount of WAL data written within a specified period of time.'],
};

## XXX Need to i18n the above
Expand Down Expand Up @@ -2681,6 +2685,9 @@ sub finishup {
## Check the number of WAL files. warning and critical are numbers
check_wal_files() if $action eq 'wal_files';

## Check the amount of WAL data written in a given time period. warning and critical are sizes
check_wal_amount() if $action eq 'wal_amount';

## Check the number of WAL files ready to archive. warning and critical are numbers
check_archive_ready() if $action eq 'archive_ready';

Expand Down Expand Up @@ -9057,6 +9064,79 @@ sub check_wal_files {

} ## end of check_wal_files


sub check_wal_amount {

    ## Check the amount of WAL data written within a specified period of time,
    ## judged by the modification timestamp of the files in the WAL directory.
    ##
    ## Warning and critical are optional, and are the amount of written data in bytes
    ## Valid units: b, k, m, g, t, e
    ## All above may be written as plural or with a trailing 'b'
    ##
    ## The period under consideration is defined with the option '--interval', the default is one day
    ## Valid units: s[econd], m[inute], h[our], d[ay]
    ## All above may be written as plural as well (e.g. "2 hours")
    ## (the interval string is converted by size_in_seconds)

    my ($warning, $critical) = ('', '');

    ## Thresholds are optional for this check, so only validate them
    ## when at least one of them was actually supplied
    if ((defined($opt{warning}) && length($opt{warning}))
        || (defined($opt{critical}) && length($opt{critical}))) {
        ($warning, $critical) = validate_range({type => 'size'});
    }

    ## Determine the time interval in seconds - the default is one day
    my $interval = size_in_seconds($opt{interval} || '1 day', 'interval');

    ## Test for an empty result first: the integer test below also rejects
    ## the empty string and would otherwise make this message unreachable
    if (! length $interval) {
        ndie msg('range-notime', 'interval');
    }
    if ($interval !~ /^[-+]?\d+$/) {
        ndie msg('range-int', 'interval');
    }

    my $lsfunc = $opt{lsfunc} || 'pg_ls_dir';
    my $lsargs = $opt{lsfunc} ? q{} : q{'pg_xlog'};

    ## $interval has been validated as an integer above, so interpolating it is safe
    my $cond = qq{modification >= (now() - '$interval seconds'::interval)};

    ## Sum the size of every non-directory entry that was modified inside the interval.
    ## The WAL directory was renamed from pg_xlog to pg_wal in Postgres 10,
    ## which reports a server_version_num of 100000
    ## NOTE(review): pg_stat_file is still called directly when --lsfunc is used,
    ## so the connecting role presumably needs permission to run it - confirm
    $SQL = qq{SELECT COALESCE(SUM(size), 0) AS size
FROM $lsfunc($lsargs) AS filename
INNER JOIN pg_stat_file((SELECT CASE WHEN current_setting('server_version_num')::integer >= 100000 THEN 'pg_wal' ELSE 'pg_xlog' END) || '/' || filename) ON isdir = 'f'
WHERE $cond};

    ## Alternate query handed to run_command for servers newer than 9.6:
    ## use pg_ls_waldir() unless the user supplied their own listing function
    my $SQL10 = $opt{lsfunc} ? $SQL :
        qq{SELECT COALESCE(SUM(size), 0) AS size FROM pg_ls_waldir() WHERE $cond};

    my $info = run_command($SQL, {regex => qr[\d], version => [">9.6 $SQL10"] });

    for $db (@{$info->{db}}) {
        my $r = $db->{slurp}[0];
        my $size = $r->{size};

        ## MRTG output reports the raw number of bytes
        if ($MRTG) {
            do_mrtg({one => $size});
        }

        my $msg = msg('wal-amount', pretty_size($size,6), pretty_time($interval));

        $db->{perf} .= sprintf '%s=%s;%s;%s',
            perfname(msg('size')), $size, $warning, $critical;

        ## An empty threshold means "not set" and never triggers
        if (length $critical and $size > $critical) {
            add_critical $msg;
        }
        elsif (length $warning and $size > $warning) {
            add_warning $msg;
        }
        else {
            add_ok $msg;
        }
    }

    return;

} ## end of check_wal_amount

=pod

=encoding utf8
Expand Down Expand Up @@ -10807,6 +10887,47 @@ =head2 B<wal_files>

For MRTG output, reports the number of WAL files on line 1.

=head2 B<wal_amount>

(C<symlink: check_postgres_wal_amount>) Checks how much data was written to the WAL files in the directory F<pg_xlog> (PostgreSQL 10 and later: F<pg_wal>) within a certain period of time. This directory can be found
in the B<data_directory>, possibly as a symlink to another physical disk for
performance reasons. If the I<--lsfunc> option is not given then this action must be run as superuser, in order to access the
contents of the F<pg_xlog> directory. The minimum version to use this action is
Postgres 8.1. The I<--warning> and I<--critical> options represent the amount of data written
to the F<pg_xlog> directory in I<bytes>; a unit up to zettabytes may be specified.
Both options may be omitted if you just want to monitor the amount.
The I<--interval> option specifies the period of time, in I<seconds> by default, within which changes to the
WAL files are considered. The units s(econds), m(inutes), h(ours),
d(ays), w(eeks) and y(ears) are supported. The default value for this option is "24 hours".

To avoid connecting as a database superuser, a wrapper function around
C<pg_ls_dir()> should be defined as a superuser with SECURITY DEFINER,
and the I<--lsfunc> option used. This example function, if defined by
a superuser, will allow the script to connect as a normal user
I<nagios> with I<--lsfunc=ls_xlog_dir>

BEGIN;
CREATE FUNCTION ls_xlog_dir()
RETURNS SETOF TEXT
AS $$ SELECT pg_ls_dir('pg_xlog') $$
LANGUAGE SQL
SECURITY DEFINER;
REVOKE ALL ON FUNCTION ls_xlog_dir() FROM PUBLIC;
GRANT EXECUTE ON FUNCTION ls_xlog_dir() to nagios;
COMMIT;

Example 1: Check that the size of WAL files written in the last 90 minutes does not exceed 512MB
on host "pluto", using a wrapper function C<ls_xlog_dir> to avoid the need for superuser permissions

check_postgres_wal_amount --host=pluto --critical=512MB --lsfunc=ls_xlog_dir --interval=90m

Example 2: Report the size of WAL files written in the last 5 minutes on the database
connected through the unix socket "/tmp/cptesting_socket" as user "check_postgres_testing"

check_postgres_wal_amount --host=/tmp/cptesting_socket --dbuser=check_postgres_testing --interval=5m

For MRTG output, reports the amount of data written on line 1.

=head2 B<rebuild_symlinks>

=head2 B<rebuild_symlinks_force>
Expand Down
115 changes: 115 additions & 0 deletions t/02_wal_amount.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!perl

## Test the "wal_amount" action

use 5.008;
use strict;
use warnings;
use Data::Dumper;
use Test::More tests => 12;
use lib 't','.';
use CP_Testing;

use vars qw/$dbh $t/;

my $cp = CP_Testing->new({default_action => 'wal_amount'});

## Reinitialize the test database to prevent measuring log data written by previous tests
$cp->cleanup();

$dbh = $cp->test_database_handle();

my $S = q{Action 'wal_amount'};
my $label = 'POSTGRES_WAL_AMOUNT';

$t=qq{$S fails when called with an invalid option};
like ($cp->run('foobar=12'), qr{Usage:}, $t);

$t=qq{$S fails when called with an invalid option};
like ($cp->run('--warning=30%'), qr{ERROR:.+Invalid size}, $t);
like ($cp->run('--warning=-30'), qr{ERROR:.+Invalid size}, $t);

my $ver = $dbh->{pg_server_version};
if ($ver < 80100) {

    $t=qq{$S gives an error when run against an old Postgres version};
    like ($cp->run('--warning=99'), qr{ERROR.*server version must be >= 8.1}, $t);

  SKIP: {
        ## 12 tests are planned and 4 have run so far, so 8 remain to be skipped
        skip 'Cannot test wal_amount completely on Postgres 8.0 or lower', 8;
    }

    exit;
}

$cp->drop_schema_if_exists();

$t=qq{$S works as expected for warnings};
like ($cp->run('--warning="100000 GB"'), qr{^$label OK}, $t);
like ($cp->run('--warning=0'), qr{^$label WARNING}, $t);

$t=qq{$S works as expected for criticals};
like ($cp->run('--critical="1 TB"'), qr{^$label OK}, $t);
like ($cp->run('--critical=0'), qr{^$label CRITICAL}, $t);

$cp->set_fake_schema();

## Determine the amount of WAL written in the last 15 minutes before inserting test data
my $initialWalSize = $cp->run('--interval=15m --output=simple');
chomp($initialWalSize);
$t=qq{$S reported a positive amount of written wal files ($initialWalSize) in the last 15 minutes};
ok ($initialWalSize > 0, $t);

## Create a table with simple text contents and insert one large row
## (about four WAL segments worth of random letters)
my $walSegmentSize = 16*1024*1024;
$dbh->do(q{DROP TABLE IF EXISTS cptest.randomdata});
$dbh->do(q{CREATE TABLE cptest.randomdata (data TEXT)});
my $randomText = "";
while (length( $randomText ) < (4 * $walSegmentSize)) {
    ## Append one random uppercase letter (chr 65 .. 90) per iteration
    $randomText = $randomText . chr( int(rand(26)) + 65);
}
my $sth = $dbh->prepare(q{INSERT INTO cptest.randomdata VALUES (?)});
$sth->bind_param(1, $randomText);
$sth->execute();
$dbh->commit();

my $currentWalSize = $cp->run('--interval=15m --output=simple');
chomp($currentWalSize);
$t=qq{$S reported a positive amount of written wal files ($currentWalSize) in the last 15 minutes after committing 64MB of random data};
ok ($currentWalSize > 0, $t);

## Validate that enough WAL data was written: at least three segments more than before
my $minWalSizeDelta = 3 * $walSegmentSize;
$t=qq{$S reported a minimum of more ($minWalSizeDelta) amount of written wal files ($currentWalSize) since comitted test data};
ok ($currentWalSize >= ($initialWalSize + $minWalSizeDelta), $t);

## Take a look at the MRTG output
$t=qq{$S returns correct MRTG information};
is ($cp->run('--interval=15m --output=mrtg'), "$currentWalSize\n0\n\n\n", $t);

## Check that the lsfunc option is working
## The WAL directory is named pg_wal from Postgres 10 (version number 100000) onwards
my $xlogdir = $ver >= 100000 ? 'pg_wal' : 'pg_xlog';
$dbh->do(qq{CREATE OR REPLACE FUNCTION ls_xlog_dir()
RETURNS SETOF TEXT
AS \$\$ SELECT pg_ls_dir('$xlogdir') \$\$
LANGUAGE SQL
SECURITY DEFINER});
$dbh->commit();

## Pass --lsfunc so that the wrapper function created above is really used
$t=qq{$S returns correct amount of written wal files if lsfunc is used};
like ($cp->run('--interval=15m --output=simple --lsfunc=ls_xlog_dir'), qr{^$currentWalSize$}, $t);

## Cleanup
$dbh->do(q{DROP TABLE cptest.randomdata});
$dbh->do(q{DROP FUNCTION ls_xlog_dir()});
$cp->drop_schema_if_exists();
$dbh->commit();

exit;