summaryrefslogtreecommitdiffstats
path: root/ctdb/utils/nagios
diff options
context:
space:
mode:
Diffstat (limited to 'ctdb/utils/nagios')
-rw-r--r--ctdb/utils/nagios/README56
-rwxr-xr-xctdb/utils/nagios/check_ctdb279
2 files changed, 335 insertions, 0 deletions
diff --git a/ctdb/utils/nagios/README b/ctdb/utils/nagios/README
new file mode 100644
index 0000000..99fa6dc
--- /dev/null
+++ b/ctdb/utils/nagios/README
@@ -0,0 +1,56 @@
+check_ctdb 0.3
+
+This nagios plugin is free software, and comes with ABSOLUTELY NO WARRANTY.
+It may be used, redistributed and/or modified under the terms of the GNU
+General Public Licence (see http://www.fsf.org/licensing/licenses/gpl.txt).
+
+CTDB plugin
+
+Usage: check_ctdb -i <info>
+ [ -t <timeout> ] [ -w <warn_range> ] [ -c <crit_range> ]
+ [ -H <host> ] [-s] [ -l <login_name> ]
+ [ -V ] [ -h ]
+
+ -?, --usage
+ Print usage information
+ -h, --help
+ Print detailed help screen
+ -V, --version
+ Print version information
+ --extra-opts=[section][@file]
+ Read options from an ini file. See http://nagiosplugins.org/extra-opts for usage
+ -i, --info=<info>
+ Information: One of scriptstatus or ping.
+ -H, --hostname=<login_name>
+ Host name or IP Address.
+ -s, --sudo
+ Use sudo.
+ -l, --login=<host>
+ The user to log in as on the remote machine.
+ -w, --warning=THRESHOLD
+ Warning threshold. See
+ http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
+ for the threshold format.
+ -c, --critical=THRESHOLD
+ Critical threshold. See
+ http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
+ for the threshold format.
+ -t, --timeout=INTEGER
+ Seconds before plugin times out (default: 30)
+ -v, --verbose
+ Show details for command-line debugging (can repeat up to 3 times)
+Supported commands:
+ * scriptstatus :
+ check the ctdb scriptstatus command and return CRITICAL if one of the
+ scripts fails.
+ Perfdata count the number of scripts by state (ok, disabled, error,
+ total).
+ * ping :
+ check the ctdb ping command.
+ Perfdata count the number of nodes, the total ping time and the number
+ of clients.
+ Thresholds are checked against the number of nodes.
+
+
+Copyright (c) 2011 Nantes Metropole
+
diff --git a/ctdb/utils/nagios/check_ctdb b/ctdb/utils/nagios/check_ctdb
new file mode 100755
index 0000000..7803f9a
--- /dev/null
+++ b/ctdb/utils/nagios/check_ctdb
@@ -0,0 +1,279 @@
+#!/usr/bin/perl -w
+# Nagios plugin to monitor CTDB (Clustered Trivial Database)
+#
+# License: GPL
+# Copyright (c) 2011 Nantes Metropole
+# Author: Mathieu Parent <math.parent@gmail.com>
+# Contributor(s): -
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+use strict;
+use warnings;
+use vars qw($PROGNAME $VERSION $output $values $result);
+use Nagios::Plugin;
+use File::Basename;
+
+$PROGNAME = basename($0);
+$VERSION = '0.4';
+
+my $np = Nagios::Plugin->new(
+ usage => "Usage: %s -i <info>\n"
+ . " [ -t <timeout> ] [ -w <warn_range> ] [ -c <crit_range> ]\n"
+ . " [ -H <host> ] [-s] [ -l <login_name> ]\n"
+ . ' [ -V ] [ -h ]',
+ version => $VERSION,
+ plugin => $PROGNAME,
+ shortname => uc($PROGNAME),
+ blurb => 'CTDB plugin',
+ extra => "Supported commands:\n"
+ . " * scriptstatus :\n"
+ . " check the ctdb scriptstatus command and return CRITICAL if one of the\n"
+ . " scripts fails.\n"
+ . " Perfdata count the number of scripts by state (ok, disabled, error,\n"
+ . " total).\n"
+ . " * ping :\n"
+ . " check the ctdb ping command.\n"
+ . " Perfdata count the number of nodes, the total ping time and the number\n"
+ . " of clients.\n"
+ . " Thresholds are checked against the number of nodes.\n"
+ . "\n\nCopyright (c) 2011 Nantes Metropole",
+ timeout => 30,
+);
+
+$np->add_arg(
+ spec => 'info|i=s',
+ help => "-i, --info=<info>\n"
+ . ' Information: One of scriptstatus or ping.',
+ required => 1,
+);
+
+$np->add_arg(
+ spec => 'hostname|H=s',
+ help => "-H, --hostname=<login_name>\n"
+ . ' Host name or IP Address.',
+ required => 0,
+);
+
+$np->add_arg(
+ spec => 'sudo|s',
+ help => "-s, --sudo\n"
+ . ' Use sudo.',
+ required => 0,
+);
+
+$np->add_arg(
+ spec => 'login|l=s',
+ help => "-l, --login=<host>\n"
+ . ' The user to log in as on the remote machine.',
+ required => 0,
+);
+
+$np->add_arg(
+ spec => 'warning|w=s',
+ help => "-w, --warning=THRESHOLD\n"
+ . " Warning threshold. See\n"
+ . " http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT\n"
+ . ' for the threshold format.',
+ required => 0,
+);
+
+$np->add_arg(
+ spec => 'critical|c=s',
+ help => "-c, --critical=THRESHOLD\n"
+ . " Critical threshold. See\n"
+ . " http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT\n"
+ . ' for the threshold format.',
+ required => 0,
+);
+
+$np->getopts;
+
+my $info = $np->opts->info;
+my $hostname = $np->opts->hostname;
+my $login = $np->opts->login;
+my $sudo = $np->opts->sudo;
+my $warning = $np->opts->warning;
+my $critical = $np->opts->critical;
+my $percw;
+my $percc;
+
+$output = "";
+
+if (defined($critical))
+{
+ ($percc, $critical) = check_percantage($critical);
+ $critical = undef if ($critical eq '');
+}
+
+if (defined($warning))
+{
+ ($percw, $warning) = check_percantage($warning);
+ $warning = undef if ($warning eq '');
+}
+
+$np->set_thresholds(critical => $critical, warning => $warning);
+
+my $stderr;
+
+sub safe_open_command {
+ unshift @_, "sudo" if $sudo;
+ if ($hostname) {
+ unshift @_, $hostname;
+ unshift @_, "-l", $login if $login;
+ unshift @_, "ssh";
+ }
+ open(OLDERR, ">&", \*STDERR) or die "Can't dup STDERR: $!";
+ $stderr = "";
+ close STDERR;
+ open(STDERR, ">>", \$stderr) or die "Can't open STDERR: $!";
+ if ($np->opts->verbose) {
+ print "Executing: @_\n";
+ }
+ if (!open(PIPE, '-|', @_)) {
+ $result = CRITICAL;
+ $output .= "Cannot open command '@_': $! ($stderr). ";
+ # restore STDERR
+ open(STDERR, ">", \*OLDERR) or die "Can't dup OLDERR: $!";
+ }
+}
+
+sub safe_close_command {
+ close(PIPE);
+
+ if ($? == -1) {
+ $result = CRITICAL;
+ $output .= "failed to execute: $!. ";
+ } elsif ($? & 127) {
+ $result = CRITICAL;
+ $output .= sprintf("child died with signal %d, %s coredump. ",
+ ($? & 127), ($? & 128) ? 'with' : 'without');
+ } elsif ($? >> 8) {
+ if (($? >> 8) == 255) {
+ # ctdb returns -1=255 if any node is disconnected
+ $result = WARNING;
+ $output .= sprintf("child exited with value %d. ", $? >> 8) if $output eq "";
+ } else {
+ $result = CRITICAL;
+ $output .= sprintf("child exited with value %d. ", $? >> 8);
+ }
+ }
+ # restore STDERR
+ open(STDERR, ">&OLDERR") or die "Can't dup OLDERR: $!";
+}
+
+# main :
+
+if ($info eq "scriptstatus") {
+ $result = OK;
+ safe_open_command('ctdb', '-X', 'scriptstatus');
+ if ($result == OK) {
+ my $script_count = 0;
+ my $ok_script_count = 0;
+ my $disabled_script_count = 0;
+ my $error_script_count = 0;
+ while (<PIPE>) {
+ next if $. == 1; # Header
+ $script_count++;
+ chop;
+ my ($col0, $type, $name, $code, $status, $start, $end, @error) = split("|");
+ if ($col0 ne '') {
+ # Old version, before 30 Aug 2011 and commit a779d83a6213
+ ($type, $name, $code, $status, $start, $end, @error) = ($col0, $type, $name, $code, $status, $start, $end, @error);
+ }
+ my $error = join(':', @error);
+ if ($error ne "") {
+ $output = "$output ;; " if $output;
+ $output = "$output$name ($status=$code): $error ";
+ if ($result != CRITICAL) {
+ $result = WARNING;
+ }
+ }
+ if ($status eq "OK") {
+ $ok_script_count++;
+ next;
+ }
+ if ($status eq "DISABLED") {
+ $disabled_script_count++;
+ next;
+ }
+ $error_script_count++;
+ $result = WARNING;
+ }
+ safe_close_command();
+ $np->add_perfdata(label => "ok", value => $ok_script_count, uom => '',
+ min => 0, max => $script_count);
+ $np->add_perfdata(label => "disabled", value => $disabled_script_count, uom => '',
+ min => 0, max => $script_count);
+ $np->add_perfdata(label => "error", value => $error_script_count, uom => '',
+ min => 0, max => $script_count, warning => '0', critical => '0');
+ $np->add_perfdata(label => "total", value => $script_count, uom => '',
+ min => 0, max => $script_count);
+ if ($result == OK) {
+ $result = $np->check_threshold(check => $error_script_count, warning => '0', critical => '0');
+ }
+ }
+ $np->nagios_exit($result, $output);
+} elsif ($info eq "ping") {
+ # Get expected nodes count
+ $result = OK;
+ safe_open_command('cat', '/etc/ctdb/nodes');
+ 1 while( <PIPE> );
+ my $max_nodes_count = $.;
+ safe_close_command();
+ # ctdb ping
+ $result = OK;
+ safe_open_command('ctdb', '-n', 'all', 'ping');
+ if ($result == OK) {
+ my $nodes_count = 0;
+ my $time_total = 0.0;
+ my $clients_count = 0;
+ while (<PIPE>) {
+ chop;
+ if ($_ =~ /^response from (\d+) time=([0-9.]+) sec \((\d+) clients\)$/) {
+ my ($node_id, $time, $clients) = ($1,$2,$3);
+ $nodes_count += 1;
+ $time_total += $time;
+ $clients_count += $clients;
+ } elsif ($_ =~ /^Unable to get ping response from node (\d+)$/) {
+ #
+ } else {
+ $result = CRITICAL;
+ $output .= "'$_' doesn't match regexp. "
+ }
+ }
+ $output .= sprintf("%d missing nodes. ", $max_nodes_count - $nodes_count) if $nodes_count < $max_nodes_count;
+ safe_close_command();
+ $np->add_perfdata(label => "nodes", value => $nodes_count, uom => '',
+ min => 0, max => $max_nodes_count, warning => $warning, critical => $critical);
+ $np->add_perfdata(label => "ping_time", value => $time_total, uom => 's',
+ min => 0, max => undef);
+ $np->add_perfdata(label => "clients", value => $clients_count, uom => '',
+ min => 0, max => undef);
+ if ($result == OK) {
+ $result = $np->check_threshold(check => $nodes_count);
+ }
+ }
+ $np->nagios_exit($result, $output);
+} else {
+ $np->nagios_exit(UNKNOWN, "Unknown command: '$info'");
+}
+
+sub check_percantage
+{
+ my ($number) = shift(@_);
+ my $perc = $number =~ s/\%//;
+ return ($perc, $number);
+}
+