Diffstat (limited to 'tests/sla_test.go')
-rw-r--r--  tests/sla_test.go  385
1 file changed, 385 insertions(+), 0 deletions(-)
diff --git a/tests/sla_test.go b/tests/sla_test.go
new file mode 100644
index 0000000..6fa3a0e
--- /dev/null
+++ b/tests/sla_test.go
@@ -0,0 +1,385 @@
+package icingadb_test
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "github.com/icinga/icinga-testing/utils"
+ "github.com/icinga/icinga-testing/utils/eventually"
+ "github.com/jmoiron/sqlx"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+ "go.uber.org/zap"
+ "math"
+ "net/http"
+ "testing"
+ "time"
+)
+
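+// TestSla verifies that Icinga DB writes a host's SLA-relevant state and downtime history to the
+// sla_history_state and sla_history_downtime tables and drains the corresponding Redis history streams.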
+func TestSla(t *testing.T) {
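+ // Set up the test environment: MySQL with the Icinga DB schema, Redis, and an Icinga 2 master
+ // with the Icinga DB feature enabled, then start an Icinga DB instance connecting the two.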
+ m := it.MysqlDatabaseT(t)
+ m.ImportIcingaDbSchema()
+
+ r := it.RedisServerT(t)
+ i := it.Icinga2NodeT(t, "master")
+ i.EnableIcingaDb(r)
+ err := i.Reload()
+ require.NoError(t, err, "icinga2 should reload without error")
+ it.IcingaDbInstanceT(t, r, m)
+
+ client := i.ApiClient()
+
+ t.Run("StateEvents", func(t *testing.T) {
+ t.Parallel()
+
+ hostname := utils.UniqueName(t, "host")
+ client.CreateHost(t, hostname, map[string]interface{}{
+ "attrs": map[string]interface{}{
+ "enable_active_checks": false,
+ "enable_passive_checks": true,
+ "check_command": "dummy",
+ "max_check_attempts": 3,
+ },
+ })
+
+ type StateChange struct {
+ Time float64
+ State int
+ }
+
+ var stateChanges []StateChange
+
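+ // processCheckResult submits a passive check result for the host, verifies the resulting soft/hard
+ // state via the API, and records each hard state change in stateChanges. isHard states whether the
+ // result is expected to produce a new hard state change.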
+ processCheckResult := func(exitStatus int, isHard bool) *ObjectsHostsResponse {
+ time.Sleep(10 * time.Millisecond) // ensure consecutive check results get distinct timestamps at millisecond resolution
+
+ output := utils.UniqueName(t, "output")
+ data := ActionsProcessCheckResultRequest{
+ Type: "Host",
+ Filter: fmt.Sprintf(`host.name==%q`, hostname),
+ ExitStatus: exitStatus,
+ PluginOutput: output,
+ }
+ dataJson, err := json.Marshal(data)
+ require.NoError(t, err, "marshal request")
+ response, err := client.PostJson("/v1/actions/process-check-result", bytes.NewBuffer(dataJson))
+ require.NoError(t, err, "process-check-result")
+ require.Equal(t, 200, response.StatusCode, "process-check-result")
+
+ response, err = client.GetJson("/v1/objects/hosts/" + hostname)
+ require.NoError(t, err, "get host: request")
+ require.Equal(t, 200, response.StatusCode, "get host: request")
+
+ var hosts ObjectsHostsResponse
+ err = json.NewDecoder(response.Body).Decode(&hosts)
+ require.NoError(t, err, "get host: parse response")
+
+ require.Equal(t, 1, len(hosts.Results), "there must be one host in the response")
+ host := hosts.Results[0]
+ require.Equal(t, output, host.Attrs.LastCheckResult.Output,
+ "last check result should be visible in host object")
+ require.Equal(t, exitStatus, host.Attrs.State, "soft state should match check result")
+
+ if isHard {
+ require.Equal(t, exitStatus, host.Attrs.LastHardState, "hard state should match check result")
+ if len(stateChanges) > 0 {
+ require.Greater(t, host.Attrs.LastHardStateChange, stateChanges[len(stateChanges)-1].Time,
+ "last_hard_state_change_time of host should have changed")
+ }
+ stateChanges = append(stateChanges, StateChange{
+ Time: host.Attrs.LastHardStateChange,
+ State: exitStatus,
+ })
+ } else {
+ require.NotEmpty(t, stateChanges, "there should be a hard state change prior to a soft one")
+ require.Equal(t, stateChanges[len(stateChanges)-1].Time, host.Attrs.LastHardStateChange,
+ "check result should not lead to a hard state change, i.e. last_hard_state_change should not change")
+ }
+
+ return &hosts
+ }
+
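+ // Drive the host through UP -> DOWN -> UP. With max_check_attempts = 3 it takes three consecutive
+ // non-UP results to reach a hard DOWN state; only the transitions marked below are hard state changes.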
+ processCheckResult(0, true) // hard (UNKNOWN -> UP)
+ processCheckResult(1, false) // soft
+ processCheckResult(1, false) // soft
+ processCheckResult(1, true) // hard (UP -> DOWN)
+ processCheckResult(1, false) // hard, but no new hard state change
+ processCheckResult(0, true) // hard (DOWN -> UP)
+ processCheckResult(0, false) // hard, but no new hard state change
+
+ assert.Equal(t, 3, len(stateChanges), "there should be three hard state changes")
+
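+ // Each hard state change should show up as one row in sla_history_state with matching event time and hard state.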
+ db, err := sqlx.Connect("mysql", m.DSN())
+ require.NoError(t, err, "connecting to mysql")
+ defer func() { _ = db.Close() }()
+
+ type Row struct {
+ Time int64 `db:"event_time"`
+ State int `db:"hard_state"`
+ }
+
+ eventually.Assert(t, func(t require.TestingT) {
+ var rows []Row
+ err = db.Select(&rows, db.Rebind("SELECT s.event_time, s.hard_state FROM sla_history_state s "+
+ "JOIN host ON host.id = s.host_id WHERE host.name = ? ORDER BY event_time ASC"), hostname)
+ require.NoError(t, err, "select sla_history_state")
+
+ assert.Equal(t, len(stateChanges), len(rows), "number of sla_history_state entries")
+
+ for i := range rows {
+ assert.WithinDuration(t, time.UnixMilli(int64(stateChanges[i].Time*1000)), time.UnixMilli(rows[i].Time),
+ time.Millisecond, "event time should match state change time")
+ assert.Equal(t, stateChanges[i].State, rows[i].State, "hard state should match")
+ }
+ }, 5*time.Second, 200*time.Millisecond)
+
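+ // Icinga DB should eventually drain the Redis state history stream after processing its entries.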
+ redis := r.Open()
+ defer func() { _ = redis.Close() }()
+
+ logger := it.Logger(t)
+
+ logger.Debug("redis state history", zap.Bool("before", true))
+ eventually.Assert(t, func(t require.TestingT) {
+ result, err := redis.XRange(context.Background(), "icinga:history:stream:state", "-", "+").Result()
+ require.NoError(t, err, "reading state history stream should not fail")
+ logger.Debug("redis state history", zap.Any("values", result))
+ assert.Empty(t, result, "redis state history stream should be drained")
+ }, 5*time.Second, 10*time.Millisecond)
+ logger.Debug("redis state history", zap.Bool("after", true))
+ })
+
+ t.Run("DowntimeEvents", func(t *testing.T) {
+ t.Parallel()
+
+ type Options struct {
+ Fixed bool // Whether to schedule a fixed or flexible downtime.
+ Cancel bool // Whether to cancel the downtime or let it expire.
+ }
+
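+ // downtimeTest creates a host, schedules a downtime on it, brings the host DOWN, and verifies the
+ // resulting sla_history_downtime row for the fixed/flexible and cancel/expire combinations.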
+ downtimeTest := func(t *testing.T, o Options) {
+ hostname := utils.UniqueName(t, "host")
+ client.CreateHost(t, hostname, map[string]interface{}{
+ "attrs": map[string]interface{}{
+ "enable_active_checks": false,
+ "enable_passive_checks": true,
+ "check_command": "dummy",
+ "max_check_attempts": 1,
+ },
+ })
+
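+ // processCheckResult submits a passive check result, verifies the resulting hard state,
+ // and returns the execution end time of the check.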
+ processCheckResult := func(status int) time.Time {
+ output := utils.RandomString(8)
+ reqBody, err := json.Marshal(ActionsProcessCheckResultRequest{
+ Type: "Host",
+ Filter: fmt.Sprintf(`host.name==%q`, hostname),
+ ExitStatus: status,
+ PluginOutput: output,
+ })
+ require.NoError(t, err, "marshal request")
+ response, err := client.PostJson("/v1/actions/process-check-result", bytes.NewBuffer(reqBody))
+ require.NoError(t, err, "process-check-result")
+ require.Equal(t, 200, response.StatusCode, "process-check-result")
+
+ response, err = client.GetJson("/v1/objects/hosts/" + hostname)
+ require.NoError(t, err, "get host: request")
+ require.Equal(t, 200, response.StatusCode, "get host: request")
+
+ var hosts ObjectsHostsResponse
+ err = json.NewDecoder(response.Body).Decode(&hosts)
+ require.NoError(t, err, "get host: parse response")
+
+ require.Equal(t, 1, len(hosts.Results), "there must be one host in the response")
+ host := hosts.Results[0]
+ require.Equal(t, output, host.Attrs.LastCheckResult.Output,
+ "last check result should be visible in host object")
+ require.Equal(t, 1, host.Attrs.StateType, "host should be in hard state")
+ require.Equal(t, status, host.Attrs.State, "state should match check result")
+
+ sec, nsec := math.Modf(host.Attrs.LastCheckResult.ExecutionEnd)
+ return time.Unix(int64(sec), int64(nsec*1e9))
+ }
+
+ // Ensure that host is in UP state.
+ processCheckResult(0)
+
+ refTime := time.Now().Truncate(time.Second)
+ // Schedule the downtime start in the past so that we notice if Icinga 2 or Icinga DB
+ // used the current time anywhere we expect the scheduled start time.
+ downtimeStart := refTime.Add(-1 * time.Hour)
+ var downtimeEnd time.Time
+ if o.Cancel || !o.Fixed {
+ // Downtimes we will cancel may expire far in the future as we never wait for them to end.
+ // The same goes for flexible downtimes: for these we only wait for their duration,
+ // not until the scheduled end.
+ downtimeEnd = refTime.Add(1 * time.Hour)
+ } else {
+ // Let all other downtimes expire soon (fixed downtimes where we wait for expiry).
+ downtimeEnd = refTime.Add(5 * time.Second)
+ }
+
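+ // A flexible downtime is only triggered once the host enters a problem state and then lasts
+ // for the configured duration.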
+ var duration time.Duration
+ if !o.Fixed {
+ duration = 10 * time.Second
+ }
+ req, err := json.Marshal(ActionsScheduleDowntimeRequest{
+ Type: "Host",
+ Filter: fmt.Sprintf(`host.name==%q`, hostname),
+ StartTime: downtimeStart.Unix(),
+ EndTime: downtimeEnd.Unix(),
+ Fixed: o.Fixed,
+ Duration: duration.Seconds(),
+ Author: utils.RandomString(8),
+ Comment: utils.RandomString(8),
+ })
+ require.NoError(t, err, "marshal request")
+ response, err := client.PostJson("/v1/actions/schedule-downtime", bytes.NewBuffer(req))
+ require.NoError(t, err, "schedule-downtime")
+ require.Equal(t, 200, response.StatusCode, "schedule-downtime")
+
+ var scheduleResponse ActionsScheduleDowntimeResponse
+ err = json.NewDecoder(response.Body).Decode(&scheduleResponse)
+ require.NoError(t, err, "decode schedule-downtime response")
+ require.Equal(t, 1, len(scheduleResponse.Results), "schedule-downtime should return 1 result")
+ require.Equal(t, http.StatusOK, scheduleResponse.Results[0].Code, "schedule-downtime result should have status code 200")
+ downtimeName := scheduleResponse.Results[0].Name
+
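+ // Row maps the downtime start and end columns of the sla_history_downtime table.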
+ type Row struct {
+ Start int64 `db:"downtime_start"`
+ End int64 `db:"downtime_end"`
+ }
+
+ db, err := sqlx.Connect("mysql", m.DSN())
+ require.NoError(t, err, "connecting to mysql")
+ defer func() { _ = db.Close() }()
+
+ if !o.Fixed {
+ // Give Icinga 2 and Icinga DB some time so that, if they were to generate an SLA history
+ // event in error, they have a chance to do so before we check for its absence.
+ time.Sleep(10 * time.Second)
+
+ var count int
+ err = db.Get(&count, db.Rebind("SELECT COUNT(*) FROM sla_history_downtime s "+
+ "JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname)
+ require.NoError(t, err, "select sla_history_state")
+ assert.Zero(t, count, "there should be no event in sla_history_downtime when scheduling a flexible downtime on an UP host")
+ }
+
+ // Bring host into DOWN state.
+ criticalTime := processCheckResult(1)
+
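+ // Once the host is DOWN, exactly one sla_history_downtime row should exist: for fixed downtimes it
+ // spans the scheduled start/end, for flexible ones it starts at the state change and spans the duration.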
+ eventually.Assert(t, func(t require.TestingT) {
+ var rows []Row
+ err = db.Select(&rows, db.Rebind("SELECT s.downtime_start, s.downtime_end FROM sla_history_downtime s "+
+ "JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname)
+ require.NoError(t, err, "select sla_history_state")
+
+ require.Equal(t, 1, len(rows), "there should be exactly one sla_history_downtime row")
+ if o.Fixed {
+ assert.Equal(t, downtimeStart, time.UnixMilli(rows[0].Start),
+ "downtime_start should match scheduled start time")
+ assert.Equal(t, downtimeEnd, time.UnixMilli(rows[0].End),
+ "downtime_end should match scheduled end time")
+ } else {
+ assert.WithinDuration(t, criticalTime, time.UnixMilli(rows[0].Start), time.Second,
+ "downtime_start should match time of host state change")
+ assert.Equal(t, duration, time.UnixMilli(rows[0].End).Sub(time.UnixMilli(rows[0].Start)),
+ "downtime_end - downtime_start duration should match scheduled duration")
+ }
+ }, 5*time.Second, 200*time.Millisecond)
+
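+ // The Redis downtime history stream should likewise eventually be drained.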
+ redis := r.Open()
+ defer func() { _ = redis.Close() }()
+
+ eventually.Assert(t, func(t require.TestingT) {
+ result, err := redis.XRange(context.Background(), "icinga:history:stream:downtime", "-", "+").Result()
+ require.NoError(t, err, "reading downtime history stream should not fail")
+ assert.Empty(t, result, "redis downtime history stream should be drained")
+ }, 5*time.Second, 10*time.Millisecond)
+
+ if o.Cancel {
+ req, err = json.Marshal(ActionsRemoveDowntimeRequest{
+ Downtime: downtimeName,
+ })
+ require.NoError(t, err, "marshal remove-downtime request")
+ response, err = client.PostJson("/v1/actions/remove-downtime", bytes.NewBuffer(req))
+ require.NoError(t, err, "remove-downtime")
+ require.Equal(t, 200, response.StatusCode, "remove-downtime")
+ }
+
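+ // Record the approximate cancel time; only used in the assertions below when o.Cancel is set.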
+ downtimeCancel := time.Now()
+
+ if !o.Cancel {
+ // Wait for the downtime to expire plus a few extra seconds. The row should not be updated;
+ // allow enough time to catch the case where Icinga DB updates it nonetheless.
+ if !o.Fixed {
+ time.Sleep(duration + 5*time.Second)
+ } else {
+ d := time.Until(downtimeEnd) + 5*time.Second
+ require.Less(t, d, time.Minute, "bug in tests: don't wait too long")
+ time.Sleep(d)
+ }
+ }
+
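+ // After cancel or expiry there should still be exactly one row, with downtime_end now reflecting
+ // the cancel time, the scheduled end, or the flexible duration, respectively.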
+ eventually.Assert(t, func(t require.TestingT) {
+ var rows []Row
+ err = db.Select(&rows, db.Rebind("SELECT s.downtime_start, s.downtime_end FROM sla_history_downtime s "+
+ "JOIN host ON host.id = s.host_id WHERE host.name = ?"), hostname)
+ require.NoError(t, err, "select sla_history_state")
+
+ require.Equal(t, 1, len(rows), "there should be exactly one sla_history_downtime row")
+ if o.Fixed {
+ assert.Equal(t, downtimeStart, time.UnixMilli(rows[0].Start),
+ "downtime_start should match scheduled start")
+ } else {
+ assert.WithinDuration(t, criticalTime, time.UnixMilli(rows[0].Start), time.Second,
+ "downtime_start should match critical time")
+ }
+ if o.Cancel {
+ // Allow some delta for the end time after cancel as we did not record the exact cancel time.
+ assert.WithinDuration(t, downtimeCancel, time.UnixMilli(rows[0].End), time.Second,
+ "downtime_end should match cancel time")
+ } else if o.Fixed {
+ assert.Equal(t, downtimeEnd, time.UnixMilli(rows[0].End),
+ "downtime_start should match scheduled end")
+ } else {
+ assert.Equal(t, duration, time.UnixMilli(rows[0].End).Sub(time.UnixMilli(rows[0].Start)),
+ "downtime_end - downtime_start duration should match scheduled duration")
+ }
+ }, 5*time.Second, 200*time.Millisecond)
+
+ eventually.Assert(t, func(t require.TestingT) {
+ result, err := redis.XRange(context.Background(), "icinga:history:stream:downtime", "-", "+").Result()
+ require.NoError(t, err, "reading downtime history stream should not fail")
+ assert.Empty(t, result, "redis downtime history stream should be drained")
+ }, 5*time.Second, 10*time.Millisecond)
+ }
+
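+ // Run downtimeTest for all four combinations of fixed/flexible and cancel/expire.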
+ t.Run("Fixed", func(t *testing.T) {
+ t.Parallel()
+
+ t.Run("Cancel", func(t *testing.T) {
+ t.Parallel()
+ downtimeTest(t, Options{Fixed: true, Cancel: true})
+ })
+
+ t.Run("Expire", func(t *testing.T) {
+ t.Parallel()
+ downtimeTest(t, Options{Fixed: true, Cancel: false})
+ })
+ })
+
+ t.Run("Flexible", func(t *testing.T) {
+ t.Parallel()
+
+ t.Run("Cancel", func(t *testing.T) {
+ t.Parallel()
+ downtimeTest(t, Options{Fixed: false, Cancel: true})
+ })
+
+ t.Run("Expire", func(t *testing.T) {
+ t.Parallel()
+ downtimeTest(t, Options{Fixed: false, Cancel: false})
+ })
+ })
+ })
+}