Commit 84e84de

clientv3: backoff resetting LeaseKeepAlive stream
A large number of client leases can cause cascading failures within the etcd cluster. Currently, when the keepalive stream has an error we always wait 500ms and then try to recreate the stream with LeaseKeepAlive(). Since there is no backoff or jitter, if the lease streams originally broke due to overload on the servers, the retries can put even more load on the servers and cause a cascading failure.

We can back off and jitter -- similar to what is done in watch streams -- to alleviate server load in the case where leases are causing the overload.

Signed-off-by: Elias Carter <[email protected]>
1 parent: 8a4955b
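
For a concrete sense of the retry schedule this introduces, here is a minimal standalone sketch (an illustration, not part of the commit) that mirrors the new expBackoff/jitterUp helpers and the constants added below (500ms minimum, 15s cap, +/-50% jitter) and prints the first few retry delays:

package main

import (
	"fmt"
	"math"
	"math/rand"
	"time"
)

// expBackoff doubles the delay each generation and clamps it to [minDelay, maxDelay]
// (same logic as the helper added in client/v3/utils.go).
func expBackoff(generation int, minDelay, maxDelay time.Duration) time.Duration {
	delay := math.Min(math.Pow(2, float64(generation))*float64(minDelay), float64(maxDelay))
	return time.Duration(delay)
}

// jitterUp spreads a duration by +/-jitter; 0.5 yields a value in [0.5d, 1.5d)
// (same logic as the existing helper in client/v3/utils.go).
func jitterUp(duration time.Duration, jitter float64) time.Duration {
	multiplier := jitter * (rand.Float64()*2 - 1)
	return time.Duration(float64(duration) * (1 + multiplier))
}

func main() {
	// Base delays before jitter: 500ms, 1s, 2s, 4s, 8s, then clamped at 15s.
	for gen := 0; gen <= 6; gen++ {
		base := expBackoff(gen, 500*time.Millisecond, 15*time.Second)
		fmt.Printf("generation %d: base %v, jittered %v\n", gen, base, jitterUp(base, 0.5))
	}
}

With the 15s cap and 50% jitter, no single retry wait can exceed 22.5s, while successive failed reconnects back off instead of hammering the servers every 500ms.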

2 files changed (+22, -4 lines)
client/v3/lease.go

Lines changed: 13 additions & 4 deletions

@@ -82,8 +82,12 @@ const (
 	// NoLease is a lease ID for the absence of a lease.
 	NoLease LeaseID = 0
 
-	// retryConnWait is how long to wait before retrying request due to an error
-	retryConnWait = 500 * time.Millisecond
+	// retryConnMinBackoff is the starting backoff when retrying a request due to an error
+	retryConnMinBackoff = 500 * time.Millisecond
+	// retryConnMaxBackoff is the max backoff when retrying a request due to an error
+	retryConnMaxBackoff = 15 * time.Second
+	// sendKeepaliveFrequency is how often to send keepalives
+	sendKeepaliveFrequency = 500 * time.Millisecond
 )
 
 // LeaseResponseChSize is the size of buffer to store unsent lease responses.
@@ -458,16 +462,19 @@ func (l *lessor) recvKeepAliveLoop() (gerr error) {
 		l.mu.Unlock()
 	}()
 
+	backoffGeneration := 0
 	for {
 		stream, err := l.resetRecv()
 		if err != nil {
+			backoffGeneration++
 			l.lg.Warn("error occurred during lease keep alive loop",
 				zap.Error(err),
 			)
 			if canceledByCaller(l.stopCtx, err) {
 				return err
 			}
 		} else {
+			backoffGeneration = 0
 			for {
 				resp, err := stream.Recv()
 				if err != nil {
@@ -485,8 +492,10 @@ func (l *lessor) recvKeepAliveLoop() (gerr error) {
 			}
 		}
 
+		backoff := jitterUp(expBackoff(backoffGeneration, retryConnMinBackoff, retryConnMaxBackoff), 0.5)
+
 		select {
-		case <-time.After(retryConnWait):
+		case <-time.After(backoff):
 		case <-l.stopCtx.Done():
 			return l.stopCtx.Err()
 		}
@@ -607,7 +616,7 @@ func (l *lessor) sendKeepAliveLoop(stream pb.Lease_LeaseKeepAliveClient) {
 		}
 
 		select {
-		case <-time.After(retryConnWait):
+		case <-time.After(sendKeepaliveFrequency):
 		case <-stream.Context().Done():
 			return
 		case <-l.donec:

client/v3/utils.go

Lines changed: 9 additions & 0 deletions

@@ -15,6 +15,7 @@
 package clientv3
 
 import (
+	"math"
 	"math/rand"
 	"time"
 )
@@ -29,3 +30,11 @@ func jitterUp(duration time.Duration, jitter float64) time.Duration {
 	multiplier := jitter * (rand.Float64()*2 - 1)
 	return time.Duration(float64(duration) * (1 + multiplier))
 }
+
+// expBackoff returns an exponential backoff duration.
+//
+// This will double the duration each generation and clamp between [minDelay, maxDelay]
+func expBackoff(generation int, minDelay, maxDelay time.Duration) time.Duration {
+	delay := math.Min(math.Pow(2, float64(generation))*float64(minDelay), float64(maxDelay))
+	return time.Duration(delay)
+}
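
As a sanity check on how the two helpers compose (a sketch only, not part of the commit; the test name is hypothetical), one could assert that the base delay never exceeds retryConnMaxBackoff and that the jittered value used in recvKeepAliveLoop stays within [0.5*base, 1.5*base]:

package clientv3

import "testing"

func TestExpBackoffJitterBounds(t *testing.T) {
	for gen := 0; gen < 20; gen++ {
		// expBackoff clamps its result, so the base can never pass the 15s cap.
		base := expBackoff(gen, retryConnMinBackoff, retryConnMaxBackoff)
		if base > retryConnMaxBackoff {
			t.Fatalf("generation %d: base %v exceeds cap %v", gen, base, retryConnMaxBackoff)
		}
		// jitterUp(d, 0.5) multiplies the duration by a factor in [0.5, 1.5).
		for i := 0; i < 100; i++ {
			if d := jitterUp(base, 0.5); d < base/2 || d > base+base/2 {
				t.Fatalf("jittered %v outside [%v, %v]", d, base/2, base+base/2)
			}
		}
	}
}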
