diff options
Diffstat (limited to '')
-rw-r--r-- | lib/icinga/checkable-check.cpp | 709 |
1 files changed, 709 insertions, 0 deletions
diff --git a/lib/icinga/checkable-check.cpp b/lib/icinga/checkable-check.cpp new file mode 100644 index 0000000..efa9477 --- /dev/null +++ b/lib/icinga/checkable-check.cpp @@ -0,0 +1,709 @@ +/* Icinga 2 | (c) 2012 Icinga GmbH | GPLv2+ */ + +#include "icinga/checkable.hpp" +#include "icinga/service.hpp" +#include "icinga/host.hpp" +#include "icinga/checkcommand.hpp" +#include "icinga/icingaapplication.hpp" +#include "icinga/cib.hpp" +#include "icinga/clusterevents.hpp" +#include "remote/messageorigin.hpp" +#include "remote/apilistener.hpp" +#include "base/objectlock.hpp" +#include "base/logger.hpp" +#include "base/convert.hpp" +#include "base/utility.hpp" +#include "base/context.hpp" + +using namespace icinga; + +boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, const MessageOrigin::Ptr&)> Checkable::OnNewCheckResult; +boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, StateType, const MessageOrigin::Ptr&)> Checkable::OnStateChange; +boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, std::set<Checkable::Ptr>, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged; +boost::signals2::signal<void (const Checkable::Ptr&, NotificationType, const CheckResult::Ptr&, const String&, const String&, const MessageOrigin::Ptr&)> Checkable::OnNotificationsRequested; +boost::signals2::signal<void (const Checkable::Ptr&)> Checkable::OnNextCheckUpdated; + +Atomic<uint_fast64_t> Checkable::CurrentConcurrentChecks (0); + +std::mutex Checkable::m_StatsMutex; +int Checkable::m_PendingChecks = 0; +std::condition_variable Checkable::m_PendingChecksCV; + +CheckCommand::Ptr Checkable::GetCheckCommand() const +{ + return dynamic_pointer_cast<CheckCommand>(NavigateCheckCommandRaw()); +} + +TimePeriod::Ptr Checkable::GetCheckPeriod() const +{ + return TimePeriod::GetByName(GetCheckPeriodRaw()); +} + +void Checkable::SetSchedulingOffset(long offset) +{ + m_SchedulingOffset = offset; +} + +long Checkable::GetSchedulingOffset() +{ + return m_SchedulingOffset; +} + +void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin) +{ + double interval; + + if (GetStateType() == StateTypeSoft && GetLastCheckResult() != nullptr) + interval = GetRetryInterval(); + else + interval = GetCheckInterval(); + + double now = Utility::GetTime(); + double adj = 0; + + if (interval > 1) + adj = fmod(now * 100 + GetSchedulingOffset(), interval * 100) / 100.0; + + if (adj != 0.0) + adj = std::min(0.5 + fmod(GetSchedulingOffset(), interval * 5) / 100.0, adj); + + double nextCheck = now - adj + interval; + double lastCheck = GetLastCheck(); + + Log(LogDebug, "Checkable") + << "Update checkable '" << GetName() << "' with check interval '" << GetCheckInterval() + << "' from last check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", (lastCheck < 0 ? 0 : lastCheck)) + << " (" << GetLastCheck() << ") to next check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ")."; + + SetNextCheck(nextCheck, false, origin); +} + +bool Checkable::HasBeenChecked() const +{ + return GetLastCheckResult() != nullptr; +} + +double Checkable::GetLastCheck() const +{ + CheckResult::Ptr cr = GetLastCheckResult(); + double schedule_end = -1; + + if (cr) + schedule_end = cr->GetScheduleEnd(); + + return schedule_end; +} + +Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr& cr, const MessageOrigin::Ptr& origin) +{ + using Result = Checkable::ProcessingResult; + + { + ObjectLock olock(this); + m_CheckRunning = false; + } + + if (!cr) + return Result::NoCheckResult; + + double now = Utility::GetTime(); + + if (cr->GetScheduleStart() == 0) + cr->SetScheduleStart(now); + + if (cr->GetScheduleEnd() == 0) + cr->SetScheduleEnd(now); + + if (cr->GetExecutionStart() == 0) + cr->SetExecutionStart(now); + + if (cr->GetExecutionEnd() == 0) + cr->SetExecutionEnd(now); + + if (!origin || origin->IsLocal()) + cr->SetSchedulingSource(IcingaApplication::GetInstance()->GetNodeName()); + + Endpoint::Ptr command_endpoint = GetCommandEndpoint(); + + if (cr->GetCheckSource().IsEmpty()) { + if ((!origin || origin->IsLocal())) + cr->SetCheckSource(IcingaApplication::GetInstance()->GetNodeName()); + + /* override check source if command_endpoint was defined */ + if (command_endpoint && !GetExtension("agent_check")) + cr->SetCheckSource(command_endpoint->GetName()); + } + + /* agent checks go through the api */ + if (command_endpoint && GetExtension("agent_check")) { + ApiListener::Ptr listener = ApiListener::GetInstance(); + + if (listener) { + /* send message back to its origin */ + Dictionary::Ptr message = ClusterEvents::MakeCheckResultMessage(this, cr); + listener->SyncSendMessage(command_endpoint, message); + } + + return Result::Ok; + + } + + if (!IsActive()) + return Result::CheckableInactive; + + bool reachable = IsReachable(); + bool notification_reachable = IsReachable(DependencyNotification); + + ObjectLock olock(this); + + CheckResult::Ptr old_cr = GetLastCheckResult(); + ServiceState old_state = GetStateRaw(); + StateType old_stateType = GetStateType(); + long old_attempt = GetCheckAttempt(); + bool recovery = false; + + /* When we have an check result already (not after fresh start), + * prevent to accept old check results and allow overrides for + * CRs happened in the future. + */ + if (old_cr) { + double currentCRTimestamp = old_cr->GetExecutionStart(); + double newCRTimestamp = cr->GetExecutionStart(); + + /* Our current timestamp may be from the future (wrong server time adjusted again). Allow overrides here. */ + if (currentCRTimestamp > now) { + /* our current CR is from the future, let the new CR override it. */ + Log(LogDebug, "Checkable") + << std::fixed << std::setprecision(6) << "Processing check result for checkable '" << GetName() << "' from " + << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp + << "). Overriding since ours is from the future at " + << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ")."; + } else { + /* Current timestamp is from the past, but the new timestamp is even more in the past. Skip it. */ + if (newCRTimestamp < currentCRTimestamp) { + Log(LogDebug, "Checkable") + << std::fixed << std::setprecision(6) << "Skipping check result for checkable '" << GetName() << "' from " + << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", newCRTimestamp) << " (" << newCRTimestamp + << "). It is in the past compared to ours at " + << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", currentCRTimestamp) << " (" << currentCRTimestamp << ")."; + return Result::NewerCheckResultPresent; + } + } + } + + /* The ExecuteCheck function already sets the old state, but we need to do it again + * in case this was a passive check result. */ + SetLastStateRaw(old_state); + SetLastStateType(old_stateType); + SetLastReachable(reachable); + + Host::Ptr host; + Service::Ptr service; + tie(host, service) = GetHostService(this); + + CheckableType checkableType = CheckableHost; + if (service) + checkableType = CheckableService; + + long attempt = 1; + + std::set<Checkable::Ptr> children = GetChildren(); + + if (IsStateOK(cr->GetState())) { + SetStateType(StateTypeHard); // NOT-OK -> HARD OK + + if (!IsStateOK(old_state)) + recovery = true; + + ResetNotificationNumbers(); + SaveLastState(ServiceOK, cr->GetExecutionEnd()); + } else { + /* OK -> NOT-OK change, first SOFT state. Reset attempt counter. */ + if (IsStateOK(old_state)) { + SetStateType(StateTypeSoft); + attempt = 1; + } + + /* SOFT state change, increase attempt counter. */ + if (old_stateType == StateTypeSoft && !IsStateOK(old_state)) { + SetStateType(StateTypeSoft); + attempt = old_attempt + 1; + } + + /* HARD state change (e.g. previously 2/3 and this next attempt). Reset attempt counter. */ + if (attempt >= GetMaxCheckAttempts()) { + SetStateType(StateTypeHard); + attempt = 1; + } + + if (!IsStateOK(cr->GetState())) { + SaveLastState(cr->GetState(), cr->GetExecutionEnd()); + } + } + + if (!reachable) + SetLastStateUnreachable(cr->GetExecutionEnd()); + + SetCheckAttempt(attempt); + + ServiceState new_state = cr->GetState(); + SetStateRaw(new_state); + + bool stateChange; + + /* Exception on state change calculation for hosts. */ + if (checkableType == CheckableService) + stateChange = (old_state != new_state); + else + stateChange = (Host::CalculateState(old_state) != Host::CalculateState(new_state)); + + /* Store the current last state change for the next iteration. */ + SetPreviousStateChange(GetLastStateChange()); + + if (stateChange) { + SetLastStateChange(cr->GetExecutionEnd()); + + /* remove acknowledgements */ + if (GetAcknowledgement() == AcknowledgementNormal || + (GetAcknowledgement() == AcknowledgementSticky && IsStateOK(new_state))) { + ClearAcknowledgement(""); + } + } + + bool remove_acknowledgement_comments = false; + + if (GetAcknowledgement() == AcknowledgementNone) + remove_acknowledgement_comments = true; + + bool hardChange = (GetStateType() == StateTypeHard && old_stateType == StateTypeSoft); + + if (stateChange && old_stateType == StateTypeHard && GetStateType() == StateTypeHard) + hardChange = true; + + bool is_volatile = GetVolatile(); + + if (hardChange || is_volatile) { + SetLastHardStateRaw(new_state); + SetLastHardStateChange(cr->GetExecutionEnd()); + SetLastHardStatesRaw(GetLastHardStatesRaw() / 100u + new_state * 100u); + } + + if (stateChange) { + SetLastSoftStatesRaw(GetLastSoftStatesRaw() / 100u + new_state * 100u); + } + + cr->SetPreviousHardState(ServiceState(GetLastHardStatesRaw() % 100u)); + + if (!IsStateOK(new_state)) + TriggerDowntimes(cr->GetExecutionEnd()); + + /* statistics for external tools */ + Checkable::UpdateStatistics(cr, checkableType); + + bool in_downtime = IsInDowntime(); + + bool send_notification = false; + bool suppress_notification = !notification_reachable || in_downtime || IsAcknowledged(); + + /* Send notifications whether when a hard state change occurred. */ + if (hardChange && !(old_stateType == StateTypeSoft && IsStateOK(new_state))) + send_notification = true; + /* Or if the checkable is volatile and in a HARD state. */ + else if (is_volatile && GetStateType() == StateTypeHard) + send_notification = true; + + if (IsStateOK(old_state) && old_stateType == StateTypeSoft) + send_notification = false; /* Don't send notifications for SOFT-OK -> HARD-OK. */ + + if (is_volatile && IsStateOK(old_state) && IsStateOK(new_state)) + send_notification = false; /* Don't send notifications for volatile OK -> OK changes. */ + + olock.Unlock(); + + if (remove_acknowledgement_comments) + RemoveAckComments(String(), cr->GetExecutionEnd()); + + Dictionary::Ptr vars_after = new Dictionary({ + { "state", new_state }, + { "state_type", GetStateType() }, + { "attempt", GetCheckAttempt() }, + { "reachable", reachable } + }); + + if (old_cr) + cr->SetVarsBefore(old_cr->GetVarsAfter()); + + cr->SetVarsAfter(vars_after); + + olock.Lock(); + + if (service) { + SetLastCheckResult(cr); + } else { + bool wasProblem = GetProblem(); + + SetLastCheckResult(cr); + + if (GetProblem() != wasProblem) { + auto services = host->GetServices(); + olock.Unlock(); + for (auto& service : services) { + Service::OnHostProblemChanged(service, cr, origin); + } + olock.Lock(); + } + } + + bool was_flapping = IsFlapping(); + + UpdateFlappingStatus(cr->GetState()); + + bool is_flapping = IsFlapping(); + + if (cr->GetActive()) { + UpdateNextCheck(origin); + } else { + /* Reschedule the next check for external passive check results. The side effect of + * this is that for as long as we receive results for a service we + * won't execute any active checks. */ + double offset; + double ttl = cr->GetTtl(); + + if (ttl > 0) + offset = ttl; + else + offset = GetCheckInterval(); + + SetNextCheck(Utility::GetTime() + offset, false, origin); + } + + olock.Unlock(); + +#ifdef I2_DEBUG /* I2_DEBUG */ + Log(LogDebug, "Checkable") + << "Flapping: Checkable " << GetName() + << " was: " << was_flapping + << " is: " << is_flapping + << " threshold low: " << GetFlappingThresholdLow() + << " threshold high: " << GetFlappingThresholdHigh() + << "% current: " << GetFlappingCurrent() << "%."; +#endif /* I2_DEBUG */ + + if (recovery) { + for (auto& child : children) { + if (child->GetProblem() && child->GetEnableActiveChecks()) { + auto nextCheck (now + Utility::Random() % 60); + + ObjectLock oLock (child); + + if (nextCheck < child->GetNextCheck()) { + child->SetNextCheck(nextCheck); + } + } + } + } + + if (stateChange) { + /* reschedule direct parents */ + for (const Checkable::Ptr& parent : GetParents()) { + if (parent.get() == this) + continue; + + if (!parent->GetEnableActiveChecks()) + continue; + + if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) { + ObjectLock olock(parent); + parent->SetNextCheck(now); + } + } + } + + OnNewCheckResult(this, cr, origin); + + /* signal status updates to for example db_ido */ + OnStateChanged(this); + + String old_state_str = (service ? Service::StateToString(old_state) : Host::StateToString(Host::CalculateState(old_state))); + String new_state_str = (service ? Service::StateToString(new_state) : Host::StateToString(Host::CalculateState(new_state))); + + /* Whether a hard state change or a volatile state change except OK -> OK happened. */ + if (hardChange || (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state)))) { + OnStateChange(this, cr, StateTypeHard, origin); + Log(LogNotice, "Checkable") + << "State Change: Checkable '" << GetName() << "' hard state change from " << old_state_str << " to " << new_state_str << " detected." << (is_volatile ? " Checkable is volatile." : ""); + } + /* Whether a state change happened or the state type is SOFT (must be logged too). */ + else if (stateChange || GetStateType() == StateTypeSoft) { + OnStateChange(this, cr, StateTypeSoft, origin); + Log(LogNotice, "Checkable") + << "State Change: Checkable '" << GetName() << "' soft state change from " << old_state_str << " to " << new_state_str << " detected."; + } + + if (GetStateType() == StateTypeSoft || hardChange || recovery || + (is_volatile && !(IsStateOK(old_state) && IsStateOK(new_state)))) + ExecuteEventHandler(); + + int suppressed_types = 0; + + /* Flapping start/end notifications */ + if (!was_flapping && is_flapping) { + /* FlappingStart notifications happen on state changes, not in downtimes */ + if (!IsPaused()) { + if (in_downtime) { + suppressed_types |= NotificationFlappingStart; + } else { + OnNotificationsRequested(this, NotificationFlappingStart, cr, "", "", nullptr); + } + } + + Log(LogNotice, "Checkable") + << "Flapping Start: Checkable '" << GetName() << "' started flapping (Current flapping value " + << GetFlappingCurrent() << "% > high threshold " << GetFlappingThresholdHigh() << "%)."; + + NotifyFlapping(origin); + } else if (was_flapping && !is_flapping) { + /* FlappingEnd notifications are independent from state changes, must not happen in downtine */ + if (!IsPaused()) { + if (in_downtime) { + suppressed_types |= NotificationFlappingEnd; + } else { + OnNotificationsRequested(this, NotificationFlappingEnd, cr, "", "", nullptr); + } + } + + Log(LogNotice, "Checkable") + << "Flapping Stop: Checkable '" << GetName() << "' stopped flapping (Current flapping value " + << GetFlappingCurrent() << "% < low threshold " << GetFlappingThresholdLow() << "%)."; + + NotifyFlapping(origin); + } + + if (send_notification && !is_flapping) { + if (!IsPaused()) { + /* If there are still some pending suppressed state notification, keep the suppression until these are + * handled by Checkable::FireSuppressedNotifications(). + */ + bool pending = GetSuppressedNotifications() & (NotificationRecovery|NotificationProblem); + + if (suppress_notification || pending) { + suppressed_types |= (recovery ? NotificationRecovery : NotificationProblem); + } else { + OnNotificationsRequested(this, recovery ? NotificationRecovery : NotificationProblem, cr, "", "", nullptr); + } + } + } + + if (suppressed_types) { + /* If some notifications were suppressed, but just because of e.g. a downtime, + * stash them into a notification types bitmask for maybe re-sending later. + */ + + ObjectLock olock (this); + int suppressed_types_before (GetSuppressedNotifications()); + int suppressed_types_after (suppressed_types_before | suppressed_types); + + const int conflict = NotificationFlappingStart | NotificationFlappingEnd; + if ((suppressed_types_after & conflict) == conflict) { + /* Flapping start and end cancel out each other. */ + suppressed_types_after &= ~conflict; + } + + const int stateNotifications = NotificationRecovery | NotificationProblem; + if (!(suppressed_types_before & stateNotifications) && (suppressed_types & stateNotifications)) { + /* A state-related notification is suppressed for the first time, store the previous state. When + * notifications are no longer suppressed, this can be compared with the current state to determine + * if a notification must be sent. This is done differently compared to flapping notifications just above + * as for state notifications, problem and recovery don't always cancel each other. For example, + * WARNING -> OK -> CRITICAL generates both types once, but there should still be a notification. + */ + SetStateBeforeSuppression(old_stateType == StateTypeHard ? old_state : ServiceOK); + } + + if (suppressed_types_after != suppressed_types_before) { + SetSuppressedNotifications(suppressed_types_after); + } + } + + /* update reachability for child objects */ + if ((stateChange || hardChange) && !children.empty()) + OnReachabilityChanged(this, cr, children, origin); + + return Result::Ok; +} + +void Checkable::ExecuteRemoteCheck(const Dictionary::Ptr& resolvedMacros) +{ + CONTEXT("Executing remote check for object '" << GetName() << "'"); + + double scheduled_start = GetNextCheck(); + double before_check = Utility::GetTime(); + + CheckResult::Ptr cr = new CheckResult(); + cr->SetScheduleStart(scheduled_start); + cr->SetExecutionStart(before_check); + + GetCheckCommand()->Execute(this, cr, resolvedMacros, true); +} + +void Checkable::ExecuteCheck() +{ + CONTEXT("Executing check for object '" << GetName() << "'"); + + /* keep track of scheduling info in case the check type doesn't provide its own information */ + double scheduled_start = GetNextCheck(); + double before_check = Utility::GetTime(); + + SetLastCheckStarted(Utility::GetTime()); + + /* This calls SetNextCheck() which updates the CheckerComponent's idle/pending + * queues and ensures that checks are not fired multiple times. ProcessCheckResult() + * is called too late. See #6421. + */ + UpdateNextCheck(); + + bool reachable = IsReachable(); + + { + ObjectLock olock(this); + + /* don't run another check if there is one pending */ + if (m_CheckRunning) + return; + + m_CheckRunning = true; + + SetLastStateRaw(GetStateRaw()); + SetLastStateType(GetLastStateType()); + SetLastReachable(reachable); + } + + CheckResult::Ptr cr = new CheckResult(); + + cr->SetScheduleStart(scheduled_start); + cr->SetExecutionStart(before_check); + + Endpoint::Ptr endpoint = GetCommandEndpoint(); + bool local = !endpoint || endpoint == Endpoint::GetLocalEndpoint(); + + if (local) { + GetCheckCommand()->Execute(this, cr, nullptr, false); + } else { + Dictionary::Ptr macros = new Dictionary(); + GetCheckCommand()->Execute(this, cr, macros, false); + + if (endpoint->GetConnected()) { + /* perform check on remote endpoint */ + Dictionary::Ptr message = new Dictionary(); + message->Set("jsonrpc", "2.0"); + message->Set("method", "event::ExecuteCommand"); + + Host::Ptr host; + Service::Ptr service; + tie(host, service) = GetHostService(this); + + Dictionary::Ptr params = new Dictionary(); + message->Set("params", params); + params->Set("command_type", "check_command"); + params->Set("command", GetCheckCommand()->GetName()); + params->Set("host", host->GetName()); + + if (service) + params->Set("service", service->GetShortName()); + + /* + * If the host/service object specifies the 'check_timeout' attribute, + * forward this to the remote endpoint to limit the command execution time. + */ + if (!GetCheckTimeout().IsEmpty()) + params->Set("check_timeout", GetCheckTimeout()); + + params->Set("macros", macros); + + ApiListener::Ptr listener = ApiListener::GetInstance(); + + if (listener) + listener->SyncSendMessage(endpoint, message); + + /* Re-schedule the check so we don't run it again until after we've received + * a check result from the remote instance. The check will be re-scheduled + * using the proper check interval once we've received a check result. + */ + SetNextCheck(Utility::GetTime() + GetCheckCommand()->GetTimeout() + 30); + + /* + * Let the user know that there was a problem with the check if + * 1) The endpoint is not syncing (replay log, etc.) + * 2) Outside of the cold startup window (5min) + */ + } else if (!endpoint->GetSyncing() && Application::GetInstance()->GetStartTime() < Utility::GetTime() - 300) { + /* fail to perform check on unconnected endpoint */ + cr->SetState(ServiceUnknown); + + String output = "Remote Icinga instance '" + endpoint->GetName() + "' is not connected to "; + + Endpoint::Ptr localEndpoint = Endpoint::GetLocalEndpoint(); + + if (localEndpoint) + output += "'" + localEndpoint->GetName() + "'"; + else + output += "this instance"; + + cr->SetOutput(output); + + ProcessCheckResult(cr); + } + + { + ObjectLock olock(this); + m_CheckRunning = false; + } + } +} + +void Checkable::UpdateStatistics(const CheckResult::Ptr& cr, CheckableType type) +{ + time_t ts = cr->GetScheduleEnd(); + + if (type == CheckableHost) { + if (cr->GetActive()) + CIB::UpdateActiveHostChecksStatistics(ts, 1); + else + CIB::UpdatePassiveHostChecksStatistics(ts, 1); + } else if (type == CheckableService) { + if (cr->GetActive()) + CIB::UpdateActiveServiceChecksStatistics(ts, 1); + else + CIB::UpdatePassiveServiceChecksStatistics(ts, 1); + } else { + Log(LogWarning, "Checkable", "Unknown checkable type for statistic update."); + } +} + +void Checkable::IncreasePendingChecks() +{ + std::unique_lock<std::mutex> lock(m_StatsMutex); + m_PendingChecks++; +} + +void Checkable::DecreasePendingChecks() +{ + std::unique_lock<std::mutex> lock(m_StatsMutex); + m_PendingChecks--; + m_PendingChecksCV.notify_one(); +} + +int Checkable::GetPendingChecks() +{ + std::unique_lock<std::mutex> lock(m_StatsMutex); + return m_PendingChecks; +} + +void Checkable::AquirePendingCheckSlot(int maxPendingChecks) +{ + std::unique_lock<std::mutex> lock(m_StatsMutex); + while (m_PendingChecks >= maxPendingChecks) + m_PendingChecksCV.wait(lock); + + m_PendingChecks++; +} |