@@ -148,9 +148,24 @@ type tokenSlot struct {
148148 lastReqTime time.Time
149149}
150150
151+ func (ts * tokenSlot ) logFields () []zap.Field {
152+ return []zap.Field {
153+ zap .Uint64 ("slot-fill-rate" , ts .fillRate ),
154+ zap .Int64 ("slot-burst-limit" , ts .burstLimit ),
155+ zap .Float64 ("slot-require-tokens-sum" , ts .requireTokensSum ),
156+ zap .Float64 ("slot-token-capacity" , ts .tokenCapacity ),
157+ zap .Float64 ("slot-last-token-capacity" , ts .lastTokenCapacity ),
158+ zap .Time ("slot-last-req-time" , ts .lastReqTime ),
159+ }
160+ }
161+
151162// GroupTokenBucketState is the running state of TokenBucket.
152163type GroupTokenBucketState struct {
153- Tokens float64 `json:"tokens,omitempty"`
164+ Tokens float64 `json:"tokens,omitempty"`
165+ LastUpdate * time.Time `json:"last_update,omitempty"`
166+ Initialized bool `json:"initialized"`
167+
168+ resourceGroupName string
154169 // ClientUniqueID -> TokenSlot
155170 tokenSlots map [uint64 ]* tokenSlot
156171 clientConsumptionTokensSum float64
@@ -173,8 +188,6 @@ type GroupTokenBucketState struct {
173188 // means the burst limit is overridden.
174189 overrideBurstLimit int64
175190
176- LastUpdate * time.Time `json:"last_update,omitempty"`
177- Initialized bool `json:"initialized"`
178191 // settingChanged is used to keep the number of tokens returned from jittering when the fill rate changes.
179192 settingChanged bool
180193 lastCheckExpireSlot time.Time
@@ -198,6 +211,7 @@ func (gts *GroupTokenBucketState) clone() *GroupTokenBucketState {
198211 Tokens : gts .Tokens ,
199212 LastUpdate : lastUpdate ,
200213 Initialized : gts .Initialized ,
214+ resourceGroupName : gts .resourceGroupName ,
201215 tokenSlots : tokenSlots ,
202216 overrideFillRate : gts .overrideFillRate ,
203217 overrideBurstLimit : gts .overrideBurstLimit ,
@@ -210,16 +224,11 @@ func (gts *GroupTokenBucketState) resetLoan() {
210224 gts .settingChanged = false
211225 gts .Tokens = 0
212226 gts .clientConsumptionTokensSum = 0
213- evenRatio := 1.0
214- if l := len (gts .tokenSlots ); l > 0 {
215- evenRatio = 1 / float64 (l )
216- }
217-
218- evenTokens := gts .Tokens * evenRatio
227+ // Reset all slots.
219228 for _ , slot := range gts .tokenSlots {
220229 slot .requireTokensSum = 0
221- slot .tokenCapacity = evenTokens
222- slot .lastTokenCapacity = evenTokens
230+ slot .tokenCapacity = 0
231+ slot .lastTokenCapacity = 0
223232 }
224233}
225234
@@ -330,14 +339,15 @@ func (gtb *GroupTokenBucket) calcRateAndBurstLimit(ratio float64) (fillRate uint
330339}
331340
332341// NewGroupTokenBucket returns a new GroupTokenBucket
333- func NewGroupTokenBucket (tokenBucket * rmpb.TokenBucket ) * GroupTokenBucket {
342+ func NewGroupTokenBucket (resourceGroupName string , tokenBucket * rmpb.TokenBucket ) * GroupTokenBucket {
334343 if tokenBucket == nil || tokenBucket .Settings == nil {
335344 return & GroupTokenBucket {}
336345 }
337346 return & GroupTokenBucket {
338347 Settings : tokenBucket .GetSettings (),
339348 GroupTokenBucketState : GroupTokenBucketState {
340349 Tokens : tokenBucket .GetTokens (),
350+ resourceGroupName : resourceGroupName ,
341351 tokenSlots : make (map [uint64 ]* tokenSlot ),
342352 overrideFillRate : - 1 ,
343353 overrideBurstLimit : - 1 ,
@@ -418,43 +428,93 @@ func (gtb *GroupTokenBucket) updateTokens(now time.Time, burstLimit int64, clien
418428 gtb .balanceSlotTokens (clientUniqueID , requiredToken , elapseTokens )
419429}
420430
431+ func (gtb * GroupTokenBucket ) inspectAnomalies (
432+ tb * rmpb.TokenBucket ,
433+ slot * tokenSlot ,
434+ logFields []zap.Field ,
435+ ) bool {
436+ var errMsg string
437+ // Verify whether the allocated token is invalid, such as negative values, math.Inf, or math.NaN.
438+ if tb .Tokens <= 0 || math .IsInf (tb .Tokens , 0 ) || math .IsNaN (tb .Tokens ) {
439+ errMsg = "assigned token is invalid"
440+ }
441+ // Verify whether the state of the slot is abnormal.
442+ if math .IsInf (slot .tokenCapacity , 0 ) || math .IsNaN (slot .tokenCapacity ) {
443+ errMsg = "slot token capacity is invalid"
444+ }
445+ // If there is any error, reset the group token bucket to avoid the group token bucket is in a bad state.
446+ isAnomaly := len (errMsg ) > 0
447+ if isAnomaly {
448+ logFields = append (logFields ,
449+ append (
450+ slot .logFields (),
451+ zap .String ("resource-group-name" , gtb .resourceGroupName ),
452+ zap .String ("settings" , gtb .Settings .String ()),
453+ zap .Float64 ("tokens" , gtb .Tokens ),
454+ zap .Float64 ("client-consumption-tokens-sum" , gtb .clientConsumptionTokensSum ),
455+ zap .Int ("slot-len" , len (gtb .tokenSlots )),
456+ )... ,
457+ )
458+ log .Error (errMsg , logFields ... )
459+ // Reset after logging to keep the original context.
460+ gtb .resetLoan ()
461+ }
462+ return isAnomaly
463+ }
464+
421465// request requests tokens from the corresponding slot.
422- func (gtb * GroupTokenBucket ) request (now time.Time ,
466+ func (gtb * GroupTokenBucket ) request (
467+ now time.Time ,
423468 requiredToken float64 ,
424469 targetPeriodMs , clientUniqueID uint64 ,
425470) (* rmpb.TokenBucket , int64 ) {
426471 burstLimit := gtb .getBurstLimit ()
427472 gtb .updateTokens (now , burstLimit , clientUniqueID , requiredToken )
428473 slot , ok := gtb .tokenSlots [clientUniqueID ]
429474 if ! ok {
430- return & rmpb.TokenBucket {Settings : & rmpb.TokenLimitSettings {BurstLimit : burstLimit }}, 0
475+ return & rmpb.TokenBucket {
476+ Settings : & rmpb.TokenLimitSettings {BurstLimit : burstLimit },
477+ Tokens : 0.0 ,
478+ }, 0
431479 }
432480 res , trickleDuration := slot .assignSlotTokens (requiredToken , targetPeriodMs )
481+ // Inspect the group token bucket and the assigned token result to catch any anomalies.
482+ if isAnomaly := gtb .inspectAnomalies (res , slot , []zap.Field {
483+ zap .Time ("now" , now ),
484+ zap .Uint64 ("client-unique-id" , clientUniqueID ),
485+ zap .Uint64 ("target-period-ms" , targetPeriodMs ),
486+ zap .Float64 ("required-token" , requiredToken ),
487+ zap .Float64 ("assigned-tokens" , res .Tokens ),
488+ }); isAnomaly {
489+ // Return nil here to prevent sending any unexpected result to the client.
490+ // The client has to retry later to access the resource group whose state has been reset.
491+ return nil , 0
492+ }
433493 // Update bucket to record all tokens.
434494 gtb .Tokens -= slot .lastTokenCapacity - slot .tokenCapacity
435495 slot .lastTokenCapacity = slot .tokenCapacity
436-
437496 return res , trickleDuration
438497}
439498
440499func (ts * tokenSlot ) assignSlotTokens (requiredToken float64 , targetPeriodMs uint64 ) (* rmpb.TokenBucket , int64 ) {
441- var res rmpb.TokenBucket
442- burstLimit := ts .burstLimit
443- res .Settings = & rmpb.TokenLimitSettings {BurstLimit : burstLimit }
500+ res := & rmpb.TokenBucket {
501+ Settings : & rmpb.TokenLimitSettings {BurstLimit : ts .burstLimit },
502+ Tokens : 0.0 ,
503+ }
444504 if getBurstableMode (res .Settings ) == unlimited {
445505 res .Tokens = requiredToken
446- return & res , 0
506+ return res , 0
447507 }
448508 // FillRate is used for the token server unavailable in abnormal situation.
449509 if requiredToken <= 0 {
450- return & res , 0
510+ return res , 0
451511 }
452512 // If the current tokens can directly meet the requirement, returns the need token.
453513 if ts .tokenCapacity >= requiredToken {
454514 ts .tokenCapacity -= requiredToken
455515 // granted the total request tokens
456516 res .Tokens = requiredToken
457- return & res , 0
517+ return res , 0
458518 }
459519
460520 // Firstly allocate the remaining tokens
@@ -472,6 +532,7 @@ func (ts *tokenSlot) assignSlotTokens(requiredToken float64, targetPeriodMs uint
472532 targetPeriodTimeSec = targetPeriodTime .Seconds ()
473533 trickleTime = 0.
474534 fillRate = ts .fillRate
535+ burstLimit = ts .burstLimit
475536 )
476537
477538 loanCoefficient := defaultLoanCoefficient
@@ -547,5 +608,5 @@ func (ts *tokenSlot) assignSlotTokens(requiredToken float64, targetPeriodMs uint
547608 } else {
548609 trickleDuration = targetPeriodTime
549610 }
550- return & res , trickleDuration .Milliseconds ()
611+ return res , trickleDuration .Milliseconds ()
551612}
0 commit comments