#!/usr/bin/perl -w
# Nagios plugin to monitor CTDB (Clustered Trivial Database)
#
# License: GPL
# Copyright (c) 2011 Nantes Metropole
# Author: Mathieu Parent
# Contributor(s): -
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

use strict;
use warnings;

use Nagios::Plugin;
use File::Basename;

# Package globals shared between the main program and the helper subs below.
our ($PROGNAME, $VERSION, $output, $values, $result);

$PROGNAME = basename($0);
$VERSION  = '0.4';

my $np = Nagios::Plugin->new(
    usage => "Usage: %s -i <info>\n"
      . " [ -t <timeout> ] [ -w <warn_range> ] [ -c <crit_range> ]\n"
      . " [ -H <host> ] [-s] [ -l <login_name> ]\n"
      . ' [ -V ] [ -h ]',
    version   => $VERSION,
    plugin    => $PROGNAME,
    shortname => uc($PROGNAME),
    blurb     => 'CTDB plugin',
    extra     => "Supported commands:\n"
      . " * scriptstatus :\n"
      . " check the ctdb scriptstatus command and return CRITICAL if one of the\n"
      . " scripts fails.\n"
      . " Perfdata count the number of scripts by state (ok, disabled, error,\n"
      . " total).\n"
      . " * ping :\n"
      . " check the ctdb ping command.\n"
      . " Perfdata count the number of nodes, the total ping time and the number\n"
      . " of clients.\n"
      . " Thresholds are checked against the number of nodes.\n"
      . "\n\nCopyright (c) 2011 Nantes Metropole",
    timeout => 30,
);

$np->add_arg(
    spec => 'info|i=s',
    help => "-i, --info=<info>\n"
      . ' Information: One of scriptstatus or ping.',
    required => 1,
);

$np->add_arg(
    spec => 'hostname|H=s',
    help => "-H, --hostname=<host>\n"
      . ' Host name or IP Address.',
    required => 0,
);

$np->add_arg(
    spec => 'sudo|s',
    help => "-s, --sudo\n"
      . ' Use sudo.',
    required => 0,
);

$np->add_arg(
    spec => 'login|l=s',
    help => "-l, --login=<login_name>\n"
      . ' The user to log in as on the remote machine.',
    required => 0,
);

$np->add_arg(
    spec => 'warning|w=s',
    help => "-w, --warning=THRESHOLD\n"
      . " Warning threshold. See\n"
      . " http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT\n"
      . ' for the threshold format.',
    required => 0,
);

$np->add_arg(
    spec => 'critical|c=s',
    help => "-c, --critical=THRESHOLD\n"
      . " Critical threshold. See\n"
      . " http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT\n"
      . ' for the threshold format.',
    required => 0,
);

$np->getopts;

my $info     = $np->opts->info;
my $hostname = $np->opts->hostname;
my $login    = $np->opts->login;
my $sudo     = $np->opts->sudo;
my $warning  = $np->opts->warning;
my $critical = $np->opts->critical;

# Set when the corresponding threshold carried a '%' sign. They are recorded
# here but not consulted anywhere else in this script.
my $percw;
my $percc;

$output = "";

# Strip an optional '%' from each threshold; an empty remainder means
# "no threshold".
if (defined($critical)) {
    ($percc, $critical) = check_percantage($critical);
    $critical = undef if ($critical eq '');
}
if (defined($warning)) {
    ($percw, $warning) = check_percantage($warning);
    $warning = undef if ($warning eq '');
}

$np->set_thresholds(critical => $critical, warning => $warning);

# In-memory buffer that STDERR is pointed at while a command is running.
my $stderr;

# safe_open_command(@command)
#
# Open @command for reading on the global PIPE handle (list-form open, so no
# shell is involved). The command is prefixed with "sudo" when --sudo is
# given, and wrapped in "ssh [-l login] host" when --hostname is given.
#
# While the command runs, the plugin's STDERR is redirected to $stderr; the
# original handle is saved in OLDERR and restored by safe_close_command(),
# or right here when the open fails.
#
# On failure $result is set to CRITICAL and a message is appended to $output.
sub safe_open_command {
    my @command = @_;

    unshift @command, "sudo" if $sudo;
    if ($hostname) {
        unshift @command, $hostname;
        unshift @command, "-l", $login if $login;
        unshift @command, "ssh";
    }

    open(OLDERR, ">&", \*STDERR) or die "Can't dup STDERR: $!";
    $stderr = "";
    close STDERR;
    open(STDERR, ">>", \$stderr) or die "Can't open STDERR: $!";

    if ($np->opts->verbose) {
        print "Executing: @command\n";
    }

    if (!open(PIPE, '-|', @command)) {
        $result = CRITICAL;
        $output .= "Cannot open command '@command': $! ($stderr). ";
        # restore STDERR (dup of the saved handle, hence ">&")
        open(STDERR, ">&", \*OLDERR) or die "Can't dup OLDERR: $!";
    }
    return;
}

# safe_close_command()
#
# Close PIPE, translate the child's wait status ($?) into $result/$output,
# then restore the STDERR handle saved by safe_open_command().
#
#   - could not execute         -> CRITICAL
#   - killed by a signal        -> CRITICAL
#   - exit status 255           -> WARNING (never downgrades a CRITICAL)
#   - any other non-zero status -> CRITICAL
sub safe_close_command {
    close(PIPE);

    if ($? == -1) {
        $result = CRITICAL;
        $output .= "failed to execute: $!. ";
    }
    elsif ($? & 127) {
        $result = CRITICAL;
        $output .= sprintf("child died with signal %d, %s coredump. ",
            ($? & 127), ($? & 128) ? 'with' : 'without');
    }
    elsif ($? >> 8) {
        if (($? >> 8) == 255) {
            # ctdb returns -1=255 if any node is disconnected
            # Keep an already-raised CRITICAL: the worst state wins.
            $result = WARNING if $result != CRITICAL;
            $output .= sprintf("child exited with value %d. ", $? >> 8)
              if $output eq "";
        }
        else {
            $result = CRITICAL;
            $output .= sprintf("child exited with value %d. ", $? >> 8);
        }
    }

    # restore STDERR
    open(STDERR, ">&OLDERR") or die "Can't dup OLDERR: $!";
    return;
}

# main :
if ($info eq "scriptstatus") {
    $result = OK;
    safe_open_command('ctdb', '-X', 'scriptstatus');
    if ($result == OK) {
        my $script_count          = 0;
        my $ok_script_count       = 0;
        my $disabled_script_count = 0;
        my $error_script_count    = 0;

        while (my $line = <PIPE>) {
            next if $. == 1;    # Header
            $script_count++;
            chomp $line;

            # Fields are '|'-separated. The pattern must be a regex with the
            # bar escaped: a bare "|" is an alternation of two empty patterns
            # and would split the line into single characters.
            my ($col0, $type, $name, $code, $status, $start, $end, @error)
              = split(/\|/, $line);

            if (defined $col0 && $col0 ne '') {
                # Old version, before 30 Aug 2011 and commit a779d83a6213
                # (no leading separator, so every field is shifted by one).
                ($type, $name, $code, $status, $start, $end, @error)
                  = ($col0, $type, $name, $code, $status, $start, $end, @error);
            }

            # Short or malformed lines leave fields undefined; treat them as
            # empty so the comparisons below do not warn.
            for my $field ($name, $code, $status) {
                $field = '' unless defined $field;
            }

            my $error = join(':', grep { defined } @error);
            if ($error ne "") {
                $output .= " ;; " if $output;
                $output .= "$name ($status=$code): $error ";
                if ($result != CRITICAL) {
                    $result = WARNING;
                }
            }

            if ($status eq "OK") {
                $ok_script_count++;
                next;
            }
            if ($status eq "DISABLED") {
                $disabled_script_count++;
                next;
            }

            # NOTE(review): the help text says a failing script returns
            # CRITICAL, but this path yields WARNING - confirm the intended
            # severity before changing it.
            $error_script_count++;
            $result = WARNING;
        }
        safe_close_command();

        $np->add_perfdata(label => "ok", value => $ok_script_count,
            uom => '', min => 0, max => $script_count);
        $np->add_perfdata(label => "disabled", value => $disabled_script_count,
            uom => '', min => 0, max => $script_count);
        $np->add_perfdata(label => "error", value => $error_script_count,
            uom => '', min => 0, max => $script_count,
            warning => '0', critical => '0');
        $np->add_perfdata(label => "total", value => $script_count,
            uom => '', min => 0, max => $script_count);

        if ($result == OK) {
            $result = $np->check_threshold(check => $error_script_count,
                warning => '0', critical => '0');
        }
    }
    $np->nagios_exit($result, $output);
}
elsif ($info eq "ping") {
    # Get expected nodes count: one node per line of /etc/ctdb/nodes.
    my $max_nodes_count = 0;
    $result = OK;
    safe_open_command('cat', '/etc/ctdb/nodes');
    if ($result == OK) {
        $max_nodes_count++ while <PIPE>;
        safe_close_command();
    }

    # ctdb ping
    # The status is reset here, so a failure while reading the nodes file
    # does not by itself fail the check; any message it produced stays in
    # $output.
    $result = OK;
    safe_open_command('ctdb', '-n', 'all', 'ping');
    if ($result == OK) {
        my $nodes_count   = 0;
        my $time_total    = 0.0;
        my $clients_count = 0;

        while (my $line = <PIPE>) {
            chomp $line;
            if ($line =~ /^response from (\d+) time=([0-9.]+) sec \((\d+) clients\)$/) {
                my ($time, $clients) = ($2, $3);
                $nodes_count   += 1;
                $time_total    += $time;
                $clients_count += $clients;
            }
            elsif ($line =~ /^Unable to get ping response from node (\d+)$/) {
                # Nothing to do: the node is simply not counted, and shows up
                # in the "missing nodes" message below.
            }
            else {
                $result = CRITICAL;
                $output .= "'$line' doesn't match regexp. ";
            }
        }

        $output .= sprintf("%d missing nodes. ", $max_nodes_count - $nodes_count)
          if $nodes_count < $max_nodes_count;
        safe_close_command();

        $np->add_perfdata(label => "nodes", value => $nodes_count,
            uom => '', min => 0, max => $max_nodes_count,
            warning => $warning, critical => $critical);
        $np->add_perfdata(label => "ping_time", value => $time_total,
            uom => 's', min => 0, max => undef);
        $np->add_perfdata(label => "clients", value => $clients_count,
            uom => '', min => 0, max => undef);

        if ($result == OK) {
            $result = $np->check_threshold(check => $nodes_count);
        }
    }
    $np->nagios_exit($result, $output);
}
else {
    $np->nagios_exit(UNKNOWN, "Unknown command: '$info'");
}

# check_percantage($threshold)
#
# Remove the first '%' from $threshold, if any.
# Returns a two-element list: a flag that is true when a '%' was removed,
# and the threshold string without it.
sub check_percantage {
    my ($number) = shift(@_);
    my $perc = $number =~ s/\%//;
    return ($perc, $number);
}