diff options
Diffstat (limited to 'lib/icingadb/icingadbchecktask.cpp')
-rw-r--r-- | lib/icingadb/icingadbchecktask.cpp | 513 |
1 files changed, 513 insertions, 0 deletions
diff --git a/lib/icingadb/icingadbchecktask.cpp b/lib/icingadb/icingadbchecktask.cpp new file mode 100644 index 0000000..f7c5964 --- /dev/null +++ b/lib/icingadb/icingadbchecktask.cpp @@ -0,0 +1,513 @@ +/* Icinga 2 | (c) 2022 Icinga GmbH | GPLv2+ */ + +#include "icingadb/icingadbchecktask.hpp" +#include "icinga/host.hpp" +#include "icinga/checkcommand.hpp" +#include "icinga/macroprocessor.hpp" +#include "icinga/pluginutility.hpp" +#include "base/function.hpp" +#include "base/utility.hpp" +#include "base/perfdatavalue.hpp" +#include "base/convert.hpp" +#include <utility> + +using namespace icinga; + +REGISTER_FUNCTION_NONCONST(Internal, IcingadbCheck, &IcingadbCheckTask::ScriptFunc, "checkable:cr:resolvedMacros:useResolvedMacros"); + +static void ReportIcingadbCheck( + const Checkable::Ptr& checkable, const CheckCommand::Ptr& commandObj, + const CheckResult::Ptr& cr, String output, ServiceState state) +{ + if (Checkable::ExecuteCommandProcessFinishedHandler) { + double now = Utility::GetTime(); + ProcessResult pr; + pr.PID = -1; + pr.Output = std::move(output); + pr.ExecutionStart = now; + pr.ExecutionEnd = now; + pr.ExitStatus = state; + + Checkable::ExecuteCommandProcessFinishedHandler(commandObj->GetName(), pr); + } else { + cr->SetState(state); + cr->SetOutput(output); + checkable->ProcessCheckResult(cr); + } +} + +static inline +double GetXMessageTs(const Array::Ptr& xMessage) +{ + return Convert::ToLong(String(xMessage->Get(0)).Split("-")[0]) / 1000.0; +} + +void IcingadbCheckTask::ScriptFunc(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr, + const Dictionary::Ptr& resolvedMacros, bool useResolvedMacros) +{ + CheckCommand::Ptr commandObj = CheckCommand::ExecuteOverride ? CheckCommand::ExecuteOverride : checkable->GetCheckCommand(); + + Host::Ptr host; + Service::Ptr service; + tie(host, service) = GetHostService(checkable); + + MacroProcessor::ResolverList resolvers; + String silenceMissingMacroWarning; + + if (MacroResolver::OverrideMacros) + resolvers.emplace_back("override", MacroResolver::OverrideMacros); + + if (service) + resolvers.emplace_back("service", service); + resolvers.emplace_back("host", host); + resolvers.emplace_back("command", commandObj); + + auto resolve ([&](const String& macro) { + return MacroProcessor::ResolveMacros(macro, resolvers, checkable->GetLastCheckResult(), + &silenceMissingMacroWarning, MacroProcessor::EscapeCallback(), resolvedMacros, useResolvedMacros); + }); + + struct Thresholds + { + Value Warning, Critical; + }; + + auto resolveThresholds ([&resolve](const String& wmacro, const String& cmacro) { + return Thresholds{resolve(wmacro), resolve(cmacro)}; + }); + + String icingadbName = resolve("$icingadb_name$"); + + auto dumpTakesThresholds (resolveThresholds("$icingadb_full_dump_duration_warning$", "$icingadb_full_dump_duration_critical$")); + auto syncTakesThresholds (resolveThresholds("$icingadb_full_sync_duration_warning$", "$icingadb_full_sync_duration_critical$")); + auto icingaBacklogThresholds (resolveThresholds("$icingadb_redis_backlog_warning$", "$icingadb_redis_backlog_critical$")); + auto icingadbBacklogThresholds (resolveThresholds("$icingadb_database_backlog_warning$", "$icingadb_database_backlog_critical$")); + + if (resolvedMacros && !useResolvedMacros) + return; + + if (icingadbName.IsEmpty()) { + ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB UNKNOWN: Attribute 'icingadb_name' must be set.", ServiceUnknown); + return; + } + + auto conn (IcingaDB::GetByName(icingadbName)); + + if (!conn) { + ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB UNKNOWN: Icinga DB connection '" + icingadbName + "' does not exist.", ServiceUnknown); + return; + } + + auto redis (conn->GetConnection()); + + if (!redis || !redis->GetConnected()) { + ReportIcingadbCheck(checkable, commandObj, cr, "Icinga DB CRITICAL: Not connected to Redis.", ServiceCritical); + return; + } + + auto now (Utility::GetTime()); + Array::Ptr redisTime, xReadHeartbeat, xReadStats, xReadRuntimeBacklog, xReadHistoryBacklog; + + try { + auto replies (redis->GetResultsOfQueries( + { + {"TIME"}, + {"XREAD", "STREAMS", "icingadb:telemetry:heartbeat", "0-0"}, + {"XREAD", "STREAMS", "icingadb:telemetry:stats", "0-0"}, + {"XREAD", "COUNT", "1", "STREAMS", "icinga:runtime", "icinga:runtime:state", "0-0", "0-0"}, + { + "XREAD", "COUNT", "1", "STREAMS", + "icinga:history:stream:acknowledgement", + "icinga:history:stream:comment", + "icinga:history:stream:downtime", + "icinga:history:stream:flapping", + "icinga:history:stream:notification", + "icinga:history:stream:state", + "0-0", "0-0", "0-0", "0-0", "0-0", "0-0", + } + }, + RedisConnection::QueryPriority::Heartbeat + )); + + redisTime = std::move(replies.at(0)); + xReadHeartbeat = std::move(replies.at(1)); + xReadStats = std::move(replies.at(2)); + xReadRuntimeBacklog = std::move(replies.at(3)); + xReadHistoryBacklog = std::move(replies.at(4)); + } catch (const std::exception& ex) { + ReportIcingadbCheck( + checkable, commandObj, cr, + String("Icinga DB CRITICAL: Could not query Redis: ") + ex.what(), ServiceCritical + ); + return; + } + + if (!xReadHeartbeat) { + ReportIcingadbCheck( + checkable, commandObj, cr, + "Icinga DB CRITICAL: The Icinga DB daemon seems to have never run. (Missing heartbeat)", + ServiceCritical + ); + + return; + } + + auto redisOldestPending (redis->GetOldestPendingQueryTs()); + auto ongoingDumpStart (conn->GetOngoingDumpStart()); + auto dumpWhen (conn->GetLastdumpEnd()); + auto dumpTook (conn->GetLastdumpTook()); + + auto redisNow (Convert::ToLong(redisTime->Get(0)) + Convert::ToLong(redisTime->Get(1)) / 1000000.0); + Array::Ptr heartbeatMessage = Array::Ptr(Array::Ptr(xReadHeartbeat->Get(0))->Get(1))->Get(0); + auto heartbeatTime (GetXMessageTs(heartbeatMessage)); + std::map<String, String> heartbeatData; + + IcingaDB::AddKvsToMap(heartbeatMessage->Get(1), heartbeatData); + + String version = heartbeatData.at("version"); + auto icingadbNow (Convert::ToLong(heartbeatData.at("time")) / 1000.0 + (redisNow - heartbeatTime)); + auto icingadbStartTime (Convert::ToLong(heartbeatData.at("start-time")) / 1000.0); + String errMsg (heartbeatData.at("error")); + auto errSince (Convert::ToLong(heartbeatData.at("error-since")) / 1000.0); + String perfdataFromRedis = heartbeatData.at("performance-data"); + auto heartbeatLastReceived (Convert::ToLong(heartbeatData.at("last-heartbeat-received")) / 1000.0); + bool weResponsible = Convert::ToLong(heartbeatData.at("ha-responsible")); + auto weResponsibleTs (Convert::ToLong(heartbeatData.at("ha-responsible-ts")) / 1000.0); + bool otherResponsible = Convert::ToLong(heartbeatData.at("ha-other-responsible")); + auto syncOngoingSince (Convert::ToLong(heartbeatData.at("sync-ongoing-since")) / 1000.0); + auto syncSuccessWhen (Convert::ToLong(heartbeatData.at("sync-success-finish")) / 1000.0); + auto syncSuccessTook (Convert::ToLong(heartbeatData.at("sync-success-duration")) / 1000.0); + + std::ostringstream i2okmsgs, idbokmsgs, warnmsgs, critmsgs; + Array::Ptr perfdata = new Array(); + + i2okmsgs << std::fixed << std::setprecision(3); + idbokmsgs << std::fixed << std::setprecision(3); + warnmsgs << std::fixed << std::setprecision(3); + critmsgs << std::fixed << std::setprecision(3); + + const auto downForCritical (10); + auto downFor (redisNow - heartbeatTime); + bool down = false; + + if (downFor > downForCritical) { + down = true; + + critmsgs << " Last seen " << Utility::FormatDuration(downFor) + << " ago, greater than CRITICAL threshold (" << Utility::FormatDuration(downForCritical) << ")!"; + } else { + idbokmsgs << "\n* Last seen: " << Utility::FormatDuration(downFor) << " ago"; + } + + perfdata->Add(new PerfdataValue("icingadb_heartbeat_age", downFor, false, "seconds", Empty, downForCritical, 0)); + + const auto errForCritical (10); + auto err (!errMsg.IsEmpty()); + auto errFor (icingadbNow - errSince); + + if (err) { + if (errFor > errForCritical) { + critmsgs << " ERROR: " << errMsg << "!"; + } + + perfdata->Add(new PerfdataValue("error_for", errFor * (err ? 1 : -1), false, "seconds", Empty, errForCritical, 0)); + } + + if (!down) { + const auto heartbeatLagWarning (3/* Icinga DB read freq. */ + 1/* Icinga DB write freq. */ + 2/* threshold */); + auto heartbeatLag (fmin(icingadbNow - heartbeatLastReceived, 10 * 60)); + + if (!heartbeatLastReceived) { + critmsgs << " Lost Icinga 2 heartbeat!"; + } else if (heartbeatLag > heartbeatLagWarning) { + warnmsgs << " Icinga 2 heartbeat lag: " << Utility::FormatDuration(heartbeatLag) + << ", greater than WARNING threshold (" << Utility::FormatDuration(heartbeatLagWarning) << ")."; + } + + perfdata->Add(new PerfdataValue("icinga2_heartbeat_age", heartbeatLag, false, "seconds", heartbeatLagWarning, Empty, 0)); + } + + if (weResponsible) { + idbokmsgs << "\n* Responsible"; + } else if (otherResponsible) { + idbokmsgs << "\n* Not responsible, but another instance is"; + } else { + critmsgs << " No instance is responsible!"; + } + + perfdata->Add(new PerfdataValue("icingadb_responsible_instances", int(weResponsible || otherResponsible), false, "", Empty, Empty, 0, 1)); + + const auto clockDriftWarning (5); + const auto clockDriftCritical (30); + auto clockDrift (std::max({ + fabs(now - redisNow), + fabs(redisNow - icingadbNow), + fabs(icingadbNow - now), + })); + + if (clockDrift > clockDriftCritical) { + critmsgs << " Icinga 2/Redis/Icinga DB clock drift: " << Utility::FormatDuration(clockDrift) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(clockDriftCritical) << ")!"; + } else if (clockDrift > clockDriftWarning) { + warnmsgs << " Icinga 2/Redis/Icinga DB clock drift: " << Utility::FormatDuration(clockDrift) + << ", greater than WARNING threshold (" << Utility::FormatDuration(clockDriftWarning) << ")."; + } + + perfdata->Add(new PerfdataValue("clock_drift", clockDrift, false, "seconds", clockDriftWarning, clockDriftCritical, 0)); + + if (ongoingDumpStart) { + auto ongoingDumpTakes (now - ongoingDumpStart); + + if (!dumpTakesThresholds.Critical.IsEmpty() && ongoingDumpTakes > dumpTakesThresholds.Critical) { + critmsgs << " Current Icinga 2 full dump already takes " << Utility::FormatDuration(ongoingDumpTakes) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(dumpTakesThresholds.Critical) << ")!"; + } else if (!dumpTakesThresholds.Warning.IsEmpty() && ongoingDumpTakes > dumpTakesThresholds.Warning) { + warnmsgs << " Current Icinga 2 full dump already takes " << Utility::FormatDuration(ongoingDumpTakes) + << ", greater than WARNING threshold (" << Utility::FormatDuration(dumpTakesThresholds.Warning) << ")."; + } else { + i2okmsgs << "\n* Current full dump running for " << Utility::FormatDuration(ongoingDumpTakes); + } + + perfdata->Add(new PerfdataValue("icinga2_current_full_dump_duration", ongoingDumpTakes, false, "seconds", + dumpTakesThresholds.Warning, dumpTakesThresholds.Critical, 0)); + } + + if (!down && syncOngoingSince) { + auto ongoingSyncTakes (icingadbNow - syncOngoingSince); + + if (!syncTakesThresholds.Critical.IsEmpty() && ongoingSyncTakes > syncTakesThresholds.Critical) { + critmsgs << " Current full sync already takes " << Utility::FormatDuration(ongoingSyncTakes) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(syncTakesThresholds.Critical) << ")!"; + } else if (!syncTakesThresholds.Warning.IsEmpty() && ongoingSyncTakes > syncTakesThresholds.Warning) { + warnmsgs << " Current full sync already takes " << Utility::FormatDuration(ongoingSyncTakes) + << ", greater than WARNING threshold (" << Utility::FormatDuration(syncTakesThresholds.Warning) << ")."; + } else { + idbokmsgs << "\n* Current full sync running for " << Utility::FormatDuration(ongoingSyncTakes); + } + + perfdata->Add(new PerfdataValue("icingadb_current_full_sync_duration", ongoingSyncTakes, false, "seconds", + syncTakesThresholds.Warning, syncTakesThresholds.Critical, 0)); + } + + auto redisBacklog (now - redisOldestPending); + + if (!redisOldestPending) { + redisBacklog = 0; + } + + if (!icingaBacklogThresholds.Critical.IsEmpty() && redisBacklog > icingaBacklogThresholds.Critical) { + critmsgs << " Icinga 2 Redis query backlog: " << Utility::FormatDuration(redisBacklog) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(icingaBacklogThresholds.Critical) << ")!"; + } else if (!icingaBacklogThresholds.Warning.IsEmpty() && redisBacklog > icingaBacklogThresholds.Warning) { + warnmsgs << " Icinga 2 Redis query backlog: " << Utility::FormatDuration(redisBacklog) + << ", greater than WARNING threshold (" << Utility::FormatDuration(icingaBacklogThresholds.Warning) << ")."; + } + + perfdata->Add(new PerfdataValue("icinga2_redis_query_backlog", redisBacklog, false, "seconds", + icingaBacklogThresholds.Warning, icingaBacklogThresholds.Critical, 0)); + + if (!down) { + auto getBacklog = [redisNow](const Array::Ptr& streams) -> double { + if (!streams) { + return 0; + } + + double minTs = 0; + ObjectLock lock (streams); + + for (Array::Ptr stream : streams) { + auto ts (GetXMessageTs(Array::Ptr(stream->Get(1))->Get(0))); + + if (minTs == 0 || ts < minTs) { + minTs = ts; + } + } + + if (minTs > 0) { + return redisNow - minTs; + } else { + return 0; + } + }; + + double historyBacklog = getBacklog(xReadHistoryBacklog); + + if (!icingadbBacklogThresholds.Critical.IsEmpty() && historyBacklog > icingadbBacklogThresholds.Critical) { + critmsgs << " History backlog: " << Utility::FormatDuration(historyBacklog) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Critical) << ")!"; + } else if (!icingadbBacklogThresholds.Warning.IsEmpty() && historyBacklog > icingadbBacklogThresholds.Warning) { + warnmsgs << " History backlog: " << Utility::FormatDuration(historyBacklog) + << ", greater than WARNING threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Warning) << ")."; + } + + perfdata->Add(new PerfdataValue("icingadb_history_backlog", historyBacklog, false, "seconds", + icingadbBacklogThresholds.Warning, icingadbBacklogThresholds.Critical, 0)); + + double runtimeBacklog = 0; + + if (weResponsible && !syncOngoingSince) { + // These streams are only processed by the responsible instance after the full sync finished, + // it's fine for some backlog to exist otherwise. + runtimeBacklog = getBacklog(xReadRuntimeBacklog); + + if (!icingadbBacklogThresholds.Critical.IsEmpty() && runtimeBacklog > icingadbBacklogThresholds.Critical) { + critmsgs << " Runtime update backlog: " << Utility::FormatDuration(runtimeBacklog) + << ", greater than CRITICAL threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Critical) << ")!"; + } else if (!icingadbBacklogThresholds.Warning.IsEmpty() && runtimeBacklog > icingadbBacklogThresholds.Warning) { + warnmsgs << " Runtime update backlog: " << Utility::FormatDuration(runtimeBacklog) + << ", greater than WARNING threshold (" << Utility::FormatDuration(icingadbBacklogThresholds.Warning) << ")."; + } + } + + // Also report the perfdata value on the standby instance or during a full sync (as 0 in this case). + perfdata->Add(new PerfdataValue("icingadb_runtime_update_backlog", runtimeBacklog, false, "seconds", + icingadbBacklogThresholds.Warning, icingadbBacklogThresholds.Critical, 0)); + } + + auto dumpAgo (now - dumpWhen); + + if (dumpWhen) { + perfdata->Add(new PerfdataValue("icinga2_last_full_dump_ago", dumpAgo, false, "seconds", Empty, Empty, 0)); + } + + if (dumpTook) { + perfdata->Add(new PerfdataValue("icinga2_last_full_dump_duration", dumpTook, false, "seconds", Empty, Empty, 0)); + } + + if (dumpWhen && dumpTook) { + i2okmsgs << "\n* Last full dump: " << Utility::FormatDuration(dumpAgo) + << " ago, took " << Utility::FormatDuration(dumpTook); + } + + auto icingadbUptime (icingadbNow - icingadbStartTime); + + if (!down) { + perfdata->Add(new PerfdataValue("icingadb_uptime", icingadbUptime, false, "seconds", Empty, Empty, 0)); + } + + { + Array::Ptr values = PluginUtility::SplitPerfdata(perfdataFromRedis); + ObjectLock lock (values); + + for (auto& v : values) { + perfdata->Add(PerfdataValue::Parse(v)); + } + } + + if (weResponsibleTs) { + perfdata->Add(new PerfdataValue("icingadb_responsible_for", + (weResponsible ? 1 : -1) * (icingadbNow - weResponsibleTs), false, "seconds")); + } + + auto syncAgo (icingadbNow - syncSuccessWhen); + + if (syncSuccessWhen) { + perfdata->Add(new PerfdataValue("icingadb_last_full_sync_ago", syncAgo, false, "seconds", Empty, Empty, 0)); + } + + if (syncSuccessTook) { + perfdata->Add(new PerfdataValue("icingadb_last_full_sync_duration", syncSuccessTook, false, "seconds", Empty, Empty, 0)); + } + + if (syncSuccessWhen && syncSuccessTook) { + idbokmsgs << "\n* Last full sync: " << Utility::FormatDuration(syncAgo) + << " ago, took " << Utility::FormatDuration(syncSuccessTook); + } + + std::map<String, RingBuffer> statsPerOp; + + const char * const icingadbKnownStats[] = { + "config_sync", "state_sync", "history_sync", "overdue_sync", "history_cleanup" + }; + + for (auto metric : icingadbKnownStats) { + statsPerOp.emplace(std::piecewise_construct, std::forward_as_tuple(metric), std::forward_as_tuple(15 * 60)); + } + + if (xReadStats) { + Array::Ptr messages = Array::Ptr(xReadStats->Get(0))->Get(1); + ObjectLock lock (messages); + + for (Array::Ptr message : messages) { + auto ts (GetXMessageTs(message)); + std::map<String, String> opsPerSec; + + IcingaDB::AddKvsToMap(message->Get(1), opsPerSec); + + for (auto& kv : opsPerSec) { + auto buf (statsPerOp.find(kv.first)); + + if (buf == statsPerOp.end()) { + buf = statsPerOp.emplace( + std::piecewise_construct, + std::forward_as_tuple(kv.first), std::forward_as_tuple(15 * 60) + ).first; + } + + buf->second.InsertValue(ts, Convert::ToLong(kv.second)); + } + } + } + + for (auto& kv : statsPerOp) { + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_items_1min", kv.second.UpdateAndGetValues(now, 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_items_5mins", kv.second.UpdateAndGetValues(now, 5 * 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icingadb_" + kv.first + "_items_15mins", kv.second.UpdateAndGetValues(now, 15 * 60), false, "", Empty, Empty, 0)); + } + + perfdata->Add(new PerfdataValue("icinga2_redis_queries_1min", redis->GetQueryCount(60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icinga2_redis_queries_5mins", redis->GetQueryCount(5 * 60), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue("icinga2_redis_queries_15mins", redis->GetQueryCount(15 * 60), false, "", Empty, Empty, 0)); + + perfdata->Add(new PerfdataValue("icinga2_redis_pending_queries", redis->GetPendingQueryCount(), false, "", Empty, Empty, 0)); + + struct { + const char * Name; + int (RedisConnection::* Getter)(RingBuffer::SizeType span, RingBuffer::SizeType tv); + } const icingaWriteSubjects[] = { + {"config_dump", &RedisConnection::GetWrittenConfigFor}, + {"state_dump", &RedisConnection::GetWrittenStateFor}, + {"history_dump", &RedisConnection::GetWrittenHistoryFor} + }; + + for (auto subject : icingaWriteSubjects) { + perfdata->Add(new PerfdataValue(String("icinga2_") + subject.Name + "_items_1min", (redis.get()->*subject.Getter)(60, now), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String("icinga2_") + subject.Name + "_items_5mins", (redis.get()->*subject.Getter)(5 * 60, now), false, "", Empty, Empty, 0)); + perfdata->Add(new PerfdataValue(String("icinga2_") + subject.Name + "_items_15mins", (redis.get()->*subject.Getter)(15 * 60, now), false, "", Empty, Empty, 0)); + } + + ServiceState state; + std::ostringstream msgbuf; + auto i2okmsg (i2okmsgs.str()); + auto idbokmsg (idbokmsgs.str()); + auto warnmsg (warnmsgs.str()); + auto critmsg (critmsgs.str()); + + msgbuf << "Icinga DB "; + + if (!critmsg.empty()) { + state = ServiceCritical; + msgbuf << "CRITICAL:" << critmsg; + + if (!warnmsg.empty()) { + msgbuf << "\n\nWARNING:" << warnmsg; + } + } else if (!warnmsg.empty()) { + state = ServiceWarning; + msgbuf << "WARNING:" << warnmsg; + } else { + state = ServiceOK; + msgbuf << "OK: Uptime: " << Utility::FormatDuration(icingadbUptime) << ". Version: " << version << "."; + } + + if (!i2okmsg.empty()) { + msgbuf << "\n\nIcinga 2:\n" << i2okmsg; + } + + if (!idbokmsg.empty()) { + msgbuf << "\n\nIcinga DB:\n" << idbokmsg; + } + + cr->SetPerformanceData(perfdata); + ReportIcingadbCheck(checkable, commandObj, cr, msgbuf.str(), state); +} |