summaryrefslogtreecommitdiffstats
path: root/pkg/retry/retry.go
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--pkg/retry/retry.go141
1 files changed, 77 insertions, 64 deletions
diff --git a/pkg/retry/retry.go b/pkg/retry/retry.go
index da73943..e5b93de 100644
--- a/pkg/retry/retry.go
+++ b/pkg/retry/retry.go
@@ -7,12 +7,15 @@ import (
"github.com/icinga/icingadb/pkg/backoff"
"github.com/lib/pq"
"github.com/pkg/errors"
+ "io"
"net"
- "strings"
"syscall"
"time"
)
+// DefaultTimeout is our opinionated default timeout for retrying database and Redis operations.
+const DefaultTimeout = 5 * time.Minute
+
// RetryableFunc is a retryable function.
type RetryableFunc func(context.Context) error
@@ -21,10 +24,15 @@ type IsRetryable func(error) bool
// Settings aggregates optional settings for WithBackoff.
type Settings struct {
- // Timeout lets WithBackoff give up once elapsed (if >0).
+ // If >0, Timeout lets WithBackoff stop retrying gracefully once elapsed based on the following criteria:
+ // * If the execution of RetryableFunc has taken longer than Timeout, no further attempts are made.
+ // * If Timeout elapses during the sleep phase between retries, one final retry is attempted.
+ // * RetryableFunc is always granted its full execution time and is not canceled if it exceeds Timeout.
+ // This means that WithBackoff may not stop exactly after Timeout expires,
+ // or may not retry at all if the first execution of RetryableFunc already takes longer than Timeout.
Timeout time.Duration
- // OnError is called if an error occurs.
- OnError func(elapsed time.Duration, attempt uint64, err, lastErr error)
+ // OnRetryableError is called if a retryable error occurs.
+ OnRetryableError func(elapsed time.Duration, attempt uint64, err, lastErr error)
// OnSuccess is called once the operation succeeds.
OnSuccess func(elapsed time.Duration, attempt uint64, lastErr error)
}
@@ -34,16 +42,19 @@ type Settings struct {
func WithBackoff(
ctx context.Context, retryableFunc RetryableFunc, retryable IsRetryable, b backoff.Backoff, settings Settings,
) (err error) {
- parentCtx := ctx
+ // Channel for retry deadline, which is set to the channel of NewTimer() if a timeout is configured,
+ // otherwise nil, so that it blocks forever if there is no timeout.
+ var timeout <-chan time.Time
if settings.Timeout > 0 {
- var cancelCtx context.CancelFunc
- ctx, cancelCtx = context.WithTimeout(ctx, settings.Timeout)
- defer cancelCtx()
+ t := time.NewTimer(settings.Timeout)
+ defer t.Stop()
+ timeout = t.C
}
start := time.Now()
- for attempt := uint64(0); ; /* true */ attempt++ {
+ timedOut := false
+ for attempt := uint64(1); ; /* true */ attempt++ {
prevErr := err
if err = retryableFunc(ctx); err == nil {
@@ -54,43 +65,72 @@ func WithBackoff(
return
}
- if settings.OnError != nil {
- settings.OnError(time.Since(start), attempt, err, prevErr)
+ // Retryable function may have exited prematurely due to context errors.
+ // We explicitly check the context error here, as the error returned by the retryable function can pass the
+ // error.Is() checks even though it is not a real context error, e.g.
+ // https://cs.opensource.google/go/go/+/refs/tags/go1.22.2:src/net/net.go;l=422
+ // https://cs.opensource.google/go/go/+/refs/tags/go1.22.2:src/net/net.go;l=601
+ if errors.Is(ctx.Err(), context.DeadlineExceeded) || errors.Is(ctx.Err(), context.Canceled) {
+ if prevErr != nil {
+ err = errors.Wrap(err, prevErr.Error())
+ }
+
+ return
}
- isRetryable := retryable(err)
+ if !retryable(err) {
+ err = errors.Wrap(err, "can't retry")
- if prevErr != nil && (errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled)) {
- err = prevErr
+ return
}
- if !isRetryable {
- err = errors.Wrap(err, "can't retry")
+ select {
+ case <-timeout:
+ // Stop retrying immediately if executing the retryable function took longer than the timeout.
+ timedOut = true
+ default:
+ }
+
+ if timedOut {
+ err = errors.Wrap(err, "retry deadline exceeded")
return
}
- sleep := b(attempt)
+ if settings.OnRetryableError != nil {
+ settings.OnRetryableError(time.Since(start), attempt, err, prevErr)
+ }
+
select {
+ case <-time.After(b(attempt)):
+ case <-timeout:
+ // Do not stop retrying immediately, but start one last attempt to mitigate timing issues where
+ // the timeout expires while waiting for the next attempt and
+ // therefore no retries have happened during this possibly long period.
+ timedOut = true
case <-ctx.Done():
- if outerErr := parentCtx.Err(); outerErr != nil {
- err = errors.Wrap(outerErr, "outer context canceled")
- } else {
- if err == nil {
- err = ctx.Err()
- }
- err = errors.Wrap(err, "can't retry")
- }
+ err = errors.Wrap(ctx.Err(), err.Error())
return
- case <-time.After(sleep):
}
}
}
+// ResetTimeout changes the possibly expired timer t to expire after duration d.
+//
+// If the timer has already expired and nothing has been received from its channel,
+// it is automatically drained as if the timer had never expired.
+func ResetTimeout(t *time.Timer, d time.Duration) {
+ if !t.Stop() {
+ <-t.C
+ }
+
+ t.Reset(d)
+}
+
// Retryable returns true for common errors that are considered retryable,
// i.e. temporary, timeout, DNS, connection refused and reset, host down and unreachable and
-// network down and unreachable errors.
+// network down and unreachable errors. In addition, any database error is considered retryable.
func Retryable(err error) bool {
var temporary interface {
Temporary() bool
@@ -133,6 +173,12 @@ func Retryable(err error) bool {
if errors.Is(err, syscall.ENETDOWN) || errors.Is(err, syscall.ENETUNREACH) {
return true
}
+ if errors.Is(err, syscall.EPIPE) {
+ return true
+ }
+ if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
+ return true
+ }
if errors.Is(err, driver.ErrBadConn) {
return true
@@ -141,43 +187,10 @@ func Retryable(err error) bool {
return true
}
- var e *mysql.MySQLError
- if errors.As(err, &e) {
- switch e.Number {
- case 1053, 1205, 1213, 2006:
- // 1053: Server shutdown in progress
- // 1205: Lock wait timeout
- // 1213: Deadlock found when trying to get lock
- // 2006: MySQL server has gone away
- return true
- default:
- return false
- }
- }
-
- var pe *pq.Error
- if errors.As(err, &pe) {
- switch pe.Code {
- case "08000", // connection_exception
- "08006", // connection_failure
- "08001", // sqlclient_unable_to_establish_sqlconnection
- "08004", // sqlserver_rejected_establishment_of_sqlconnection
- "40001", // serialization_failure
- "40P01", // deadlock_detected
- "54000", // program_limit_exceeded
- "55006", // object_in_use
- "55P03", // lock_not_available
- "57P01", // admin_shutdown
- "57P02", // crash_shutdown
- "57P03", // cannot_connect_now
- "58000", // system_error
- "58030", // io_error
- "XX000": // internal_error
- return true
- default:
- // Class 53 - Insufficient Resources
- return strings.HasPrefix(string(pe.Code), "53")
- }
+ var mye *mysql.MySQLError
+ var pqe *pq.Error
+ if errors.As(err, &mye) || errors.As(err, &pqe) {
+ return true
}
return false