Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions backend/internal/service/openai_account_scheduler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -909,6 +909,69 @@ func TestOpenAIGatewayService_SelectAccountForModelWithExclusions_FreshUsageWind
require.Equal(t, int64(35602), account.ID)
}

// Issue #2994: an account poisoned with an inflated used% (e.g. from the reverted #2918
// inversion) gets excluded from scheduling, and a paused account never receives traffic to
// refresh its snapshot. When the snapshot is stale (codex_usage_updated_at older than the
// staleness bound) the account must be allowed a request so it can self-heal from the real
// response headers — independent of the window's reset time.
func TestOpenAIGatewayService_SelectAccountForModelWithExclusions_StaleUsageSnapshotSkipsPause_Issue2994(t *testing.T) {
ctx := context.Background()
primary := Account{
ID: 35701,
Platform: PlatformOpenAI,
Type: AccountTypeAPIKey,
Status: StatusActive,
Schedulable: true,
Concurrency: 1,
Priority: 0,
Extra: map[string]any{
"codex_5h_used_percent": 99.0,
"auto_pause_5h_threshold": 0.95,
// Window has NOT reset yet, so the reset guard stays inactive.
"codex_5h_reset_at": time.Now().Add(time.Hour).Format(time.RFC3339),
// Snapshot is stale: older than openAICodexAutoPauseStaleAfter (2h).
"codex_usage_updated_at": time.Now().Add(-3 * time.Hour).Format(time.RFC3339),
Comment on lines +931 to +933
},
}
secondary := Account{ID: 35702, Platform: PlatformOpenAI, Type: AccountTypeAPIKey, Status: StatusActive, Schedulable: true, Concurrency: 1, Priority: 5}
svc := &OpenAIGatewayService{accountRepo: schedulerTestOpenAIAccountRepo{accounts: []Account{primary, secondary}}, cfg: &config.Config{}}

account, err := svc.SelectAccountForModelWithExclusions(ctx, nil, "", "gpt-5.1", nil)
require.NoError(t, err)
require.NotNil(t, account)
require.Equal(t, int64(35701), account.ID)
}

// Guardrail for the issue #2994 self-heal: the staleness escape hatch must not apply to an
// account whose snapshot was refreshed recently. An account that really is at 99% used,
// with a fresh codex_usage_updated_at, has to remain auto-paused so that the scheduler
// falls through to the next candidate.
func TestOpenAIGatewayService_SelectAccountForModelWithExclusions_FreshExhaustedSnapshotStillPauses_Issue2994(t *testing.T) {
	// 99% used against a 0.95 threshold, window not yet reset.
	freshExtra := map[string]any{
		"codex_5h_used_percent":   99.0,
		"auto_pause_5h_threshold": 0.95,
		"codex_5h_reset_at":       time.Now().Add(time.Hour).Format(time.RFC3339),
		// Refreshed one minute ago, well inside the staleness bound, so pause still applies.
		"codex_usage_updated_at": time.Now().Add(-time.Minute).Format(time.RFC3339),
	}

	accounts := []Account{
		{
			ID:          35801,
			Platform:    PlatformOpenAI,
			Type:        AccountTypeAPIKey,
			Status:      StatusActive,
			Schedulable: true,
			Concurrency: 1,
			Priority:    0,
			Extra:       freshExtra,
		},
		{
			ID:          35802,
			Platform:    PlatformOpenAI,
			Type:        AccountTypeAPIKey,
			Status:      StatusActive,
			Schedulable: true,
			Concurrency: 1,
			Priority:    5,
		},
	}
	svc := &OpenAIGatewayService{
		accountRepo: schedulerTestOpenAIAccountRepo{accounts: accounts},
		cfg:         &config.Config{},
	}

	selected, err := svc.SelectAccountForModelWithExclusions(context.Background(), nil, "", "gpt-5.1", nil)

	// The exhausted account (35801) is skipped; the fallback (35802) is selected.
	require.NoError(t, err)
	require.NotNil(t, selected)
	require.Equal(t, int64(35802), selected.ID)
}

func TestOpenAIGatewayService_SelectAccountForModelWithExclusions_SkipsFreshlyRateLimitedSnapshotCandidate(t *testing.T) {
ctx := context.Background()
groupID := int64(10102)
Expand Down
31 changes: 31 additions & 0 deletions backend/internal/service/openai_gateway_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ const (
codexCLIVersion = "0.125.0"
// Codex 限额快照仅用于后台展示/诊断,不需要每个成功请求都立即落库。
openAICodexSnapshotPersistMinInterval = 30 * time.Second
// 配额自动暂停时,超过该时长仍未刷新的 used% 快照视为陈旧,不再据此暂停账号。
// 被暂停的账号收不到流量,其快照永远不会从上游响应头刷新;该兜底让账号在快照
// 陈旧时放行一次请求,从而通过正常响应头自愈,而无需等待整个窗口(5h/7d)重置。
openAICodexAutoPauseStaleAfter = 2 * time.Hour
)

// OpenAI allowed headers whitelist (for non-passthrough).
Expand Down Expand Up @@ -1484,9 +1488,36 @@ func resolveOpenAIQuotaUtilization(extra map[string]any, window string, now time
if openAIQuotaWindowReset(extra, window, now) {
return 0, false
}
// 快照过于陈旧(账号长期未收到流量刷新)时,不再据此暂停。放行后下一次响应头
// 会刷新快照实现自愈,避免账号在错误/过期的 used% 上被永久跳过(issue #2994)。
if openAICodexSnapshotStaleForPause(extra, now) {
return 0, false
}
return usedPercent / 100, true
}

// openAICodexSnapshotStaleForPause reports whether the Codex usage snapshot is stale
// enough that it should no longer keep an account auto-paused. It anchors on
// codex_usage_updated_at (always written by buildCodexUsageExtraUpdates). A missing or
// unparseable timestamp returns false (treated as fresh, so the account stays paused) —
// this is deliberate: it prevents any snapshot without a write time from silently escaping
// auto-pause, and a genuinely-exhausted account that is actively served refreshes the
// timestamp on every response so it never crosses the staleness bound.
func openAICodexSnapshotStaleForPause(extra map[string]any, now time.Time) bool {
if len(extra) == 0 {
return false
}
updatedRaw, ok := extra["codex_usage_updated_at"]
if !ok {
return false
}
updatedAt, err := parseTime(fmt.Sprint(updatedRaw))
if err != nil {
return false
}
Comment on lines +1514 to +1517
return now.Sub(updatedAt) >= openAICodexAutoPauseStaleAfter
}

// openAIQuotaWindowReset reports whether the Codex usage window's reset time has
// already passed relative to now. It prefers the absolute codex_<window>_reset_at
// timestamp and falls back to codex_<window>_reset_after_seconds anchored at
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,39 @@ func TestBuildCodexUsageExtraUpdates_UsesSnapshotUpdatedAt(t *testing.T) {
}
}

// TestBuildCodexUsageExtraUpdates_FreshAccountUsedPercentNotInverted_Issue2994 pins the
// meaning of the stored used-percent fields: they hold the percentage already used, taken
// directly from the snapshot.
//
// A barely-used account (1% on the 5h window, 2% on the 7d window) must be stored as 1.0
// and 2.0, never as the complement (~99 / ~98). This guards against a repeat of issue
// #2994, where the since-reverted commit b65dde63 (PR #2918) stored `100 - used`, making
// fresh accounts look exhausted so that auto-pause removed them from scheduling.
func TestBuildCodexUsageExtraUpdates_FreshAccountUsedPercentNotInverted_Issue2994(t *testing.T) {
	// In this fixture the secondary slot carries the 5h window (300 minutes) and the
	// primary slot carries the 7d window (10080 minutes); both are barely used.
	var (
		fiveHourUsed    = 1.0
		fiveHourMinutes = 300
		sevenDayUsed    = 2.0
		sevenDayMinutes = 10080
	)
	observedAt := time.Date(2026, 2, 16, 10, 0, 0, 0, time.UTC)

	snap := OpenAICodexUsageSnapshot{
		PrimaryUsedPercent:     &sevenDayUsed,
		PrimaryWindowMinutes:   &sevenDayMinutes,
		SecondaryUsedPercent:   &fiveHourUsed,
		SecondaryWindowMinutes: &fiveHourMinutes,
		UpdatedAt:              "2026-02-16T10:00:00Z",
	}

	updates := buildCodexUsageExtraUpdates(&snap, observedAt)
	if updates == nil {
		t.Fatal("expected non-nil updates")
	}

	fiveHourStored := updates["codex_5h_used_percent"]
	if fiveHourStored != 1.0 {
		t.Fatalf("codex_5h_used_percent = %v, want 1.0 (direct used%%, NOT inverted to 99)", fiveHourStored)
	}

	sevenDayStored := updates["codex_7d_used_percent"]
	if sevenDayStored != 2.0 {
		t.Fatalf("codex_7d_used_percent = %v, want 2.0 (direct used%%, NOT inverted to 98)", sevenDayStored)
	}
}

func TestBuildCodexUsageExtraUpdates_FallbackToNowWhenUpdatedAtInvalid(t *testing.T) {
primaryUsed := 15.0
primaryReset := 30
Expand Down
Loading