From 86d9b6bff982859e66382e081cae7549df3561e8 Mon Sep 17 00:00:00 2001 From: haruka <1628615876@qq.com> Date: Thu, 4 Jun 2026 22:07:36 +0800 Subject: [PATCH] fix(openai): self-heal stale Codex used% snapshots + lock semantics (#2994) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OpenAI/Codex 5h "used %" inversion that caused fresh accounts to show ~96-99% used (PR #2918, commit b65dde63) was already reverted in #2993, so the stored value is now the correct "used %" again. This commit hardens that fix: 1. Regression test locking in direct "used %" semantics. The semantics have flip-flopped twice (#2918 -> #2993) with no value-level guard — a fresh account (secondary_used_percent=1, 5h window) must store codex_5h_used_percent=1, not 99. 2. Stale-bounded self-heal in resolveOpenAIQuotaUtilization (the single auto-pause chokepoint). An account poisoned with an inflated used% gets excluded from scheduling, and a paused account never receives traffic to refresh its snapshot — so it stayed stuck until the window's reset_at passed (up to 5h/7d). When codex_usage_updated_at is older than 2h, the account is no longer auto-paused on that snapshot; it gets one request whose response headers refresh the snapshot and self-heal it. A missing timestamp is treated as fresh (stays paused), and an actively-served exhausted account refreshes the timestamp every response so it never crosses the bound — it cannot escape auto-pause. No change to Normalize(); no 100-x reintroduced; no new dependency wiring. 
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --- .../service/openai_account_scheduler_test.go | 63 +++++++++++++++++++ .../service/openai_gateway_service.go | 31 +++++++++ ...nai_gateway_service_codex_snapshot_test.go | 33 ++++++++++ 3 files changed, 127 insertions(+) diff --git a/backend/internal/service/openai_account_scheduler_test.go b/backend/internal/service/openai_account_scheduler_test.go index da5f0a6644b..505a5ade3ba 100644 --- a/backend/internal/service/openai_account_scheduler_test.go +++ b/backend/internal/service/openai_account_scheduler_test.go @@ -909,6 +909,69 @@ func TestOpenAIGatewayService_SelectAccountForModelWithExclusions_FreshUsageWind require.Equal(t, int64(35602), account.ID) } +// Issue #2994: an account poisoned with an inflated used% (e.g. from the reverted #2918 +// inversion) gets excluded from scheduling, and a paused account never receives traffic to +// refresh its snapshot. When the snapshot is stale (codex_usage_updated_at older than the +// staleness bound) the account must be allowed a request so it can self-heal from the real +// response headers — independent of the window's reset time. +func TestOpenAIGatewayService_SelectAccountForModelWithExclusions_StaleUsageSnapshotSkipsPause_Issue2994(t *testing.T) { + ctx := context.Background() + primary := Account{ + ID: 35701, + Platform: PlatformOpenAI, + Type: AccountTypeAPIKey, + Status: StatusActive, + Schedulable: true, + Concurrency: 1, + Priority: 0, + Extra: map[string]any{ + "codex_5h_used_percent": 99.0, + "auto_pause_5h_threshold": 0.95, + // Window has NOT reset yet, so the reset guard stays inactive. + "codex_5h_reset_at": time.Now().Add(time.Hour).Format(time.RFC3339), + // Snapshot is stale: older than openAICodexAutoPauseStaleAfter (2h). 
+ "codex_usage_updated_at": time.Now().Add(-3 * time.Hour).Format(time.RFC3339), + }, + } + secondary := Account{ID: 35702, Platform: PlatformOpenAI, Type: AccountTypeAPIKey, Status: StatusActive, Schedulable: true, Concurrency: 1, Priority: 5} + svc := &OpenAIGatewayService{accountRepo: schedulerTestOpenAIAccountRepo{accounts: []Account{primary, secondary}}, cfg: &config.Config{}} + + account, err := svc.SelectAccountForModelWithExclusions(ctx, nil, "", "gpt-5.1", nil) + require.NoError(t, err) + require.NotNil(t, account) + require.Equal(t, int64(35701), account.ID) +} + +// Issue #2994 guardrail: a genuinely-exhausted account whose snapshot was refreshed recently +// (codex_usage_updated_at fresh) must STILL be auto-paused. The stale self-heal must not let a +// real 99%-used account escape pause. +func TestOpenAIGatewayService_SelectAccountForModelWithExclusions_FreshExhaustedSnapshotStillPauses_Issue2994(t *testing.T) { + ctx := context.Background() + primary := Account{ + ID: 35801, + Platform: PlatformOpenAI, + Type: AccountTypeAPIKey, + Status: StatusActive, + Schedulable: true, + Concurrency: 1, + Priority: 0, + Extra: map[string]any{ + "codex_5h_used_percent": 99.0, + "auto_pause_5h_threshold": 0.95, + "codex_5h_reset_at": time.Now().Add(time.Hour).Format(time.RFC3339), + // Snapshot refreshed 1 minute ago: not stale, so the account stays paused. 
+ "codex_usage_updated_at": time.Now().Add(-time.Minute).Format(time.RFC3339), + }, + } + secondary := Account{ID: 35802, Platform: PlatformOpenAI, Type: AccountTypeAPIKey, Status: StatusActive, Schedulable: true, Concurrency: 1, Priority: 5} + svc := &OpenAIGatewayService{accountRepo: schedulerTestOpenAIAccountRepo{accounts: []Account{primary, secondary}}, cfg: &config.Config{}} + + account, err := svc.SelectAccountForModelWithExclusions(ctx, nil, "", "gpt-5.1", nil) + require.NoError(t, err) + require.NotNil(t, account) + require.Equal(t, int64(35802), account.ID) +} + func TestOpenAIGatewayService_SelectAccountForModelWithExclusions_SkipsFreshlyRateLimitedSnapshotCandidate(t *testing.T) { ctx := context.Background() groupID := int64(10102) diff --git a/backend/internal/service/openai_gateway_service.go b/backend/internal/service/openai_gateway_service.go index 17ac7fc2a01..81fcea9d0eb 100644 --- a/backend/internal/service/openai_gateway_service.go +++ b/backend/internal/service/openai_gateway_service.go @@ -59,6 +59,10 @@ const ( codexCLIVersion = "0.125.0" // Codex 限额快照仅用于后台展示/诊断,不需要每个成功请求都立即落库。 openAICodexSnapshotPersistMinInterval = 30 * time.Second + // 配额自动暂停时,超过该时长仍未刷新的 used% 快照视为陈旧,不再据此暂停账号。 + // 被暂停的账号收不到流量,其快照永远不会从上游响应头刷新;该兜底让账号在快照 + // 陈旧时放行一次请求,从而通过正常响应头自愈,而无需等待整个窗口(5h/7d)重置。 + openAICodexAutoPauseStaleAfter = 2 * time.Hour ) // OpenAI allowed headers whitelist (for non-passthrough). @@ -1484,9 +1488,36 @@ func resolveOpenAIQuotaUtilization(extra map[string]any, window string, now time if openAIQuotaWindowReset(extra, window, now) { return 0, false } + // 快照过于陈旧(账号长期未收到流量刷新)时,不再据此暂停。放行后下一次响应头 + // 会刷新快照实现自愈,避免账号在错误/过期的 used% 上被永久跳过(issue #2994)。 + if openAICodexSnapshotStaleForPause(extra, now) { + return 0, false + } return usedPercent / 100, true } +// openAICodexSnapshotStaleForPause reports whether the Codex usage snapshot is stale +// enough that it should no longer keep an account auto-paused. 
It anchors on +// codex_usage_updated_at (always written by buildCodexUsageExtraUpdates). A missing or +// unparseable timestamp returns false (treated as fresh, so the account stays paused) — +// this is deliberate: it prevents any snapshot without a write time from silently escaping +// auto-pause, and a genuinely-exhausted account that is actively served refreshes the +// timestamp on every response so it never crosses the staleness bound. +func openAICodexSnapshotStaleForPause(extra map[string]any, now time.Time) bool { + if len(extra) == 0 { + return false + } + updatedRaw, ok := extra["codex_usage_updated_at"] + if !ok { + return false + } + updatedAt, err := parseTime(fmt.Sprint(updatedRaw)) + if err != nil { + return false + } + return now.Sub(updatedAt) >= openAICodexAutoPauseStaleAfter +} + // openAIQuotaWindowReset reports whether the Codex usage window's reset time has // already passed relative to now. It prefers the absolute codex_<window>_reset_at // timestamp and falls back to codex_<window>_reset_after_seconds anchored at diff --git a/backend/internal/service/openai_gateway_service_codex_snapshot_test.go b/backend/internal/service/openai_gateway_service_codex_snapshot_test.go index 654dd4cabe8..27208b58d92 100644 --- a/backend/internal/service/openai_gateway_service_codex_snapshot_test.go +++ b/backend/internal/service/openai_gateway_service_codex_snapshot_test.go @@ -104,6 +104,39 @@ func TestBuildCodexUsageExtraUpdates_UsesSnapshotUpdatedAt(t *testing.T) { } } +// TestBuildCodexUsageExtraUpdates_FreshAccountUsedPercentNotInverted_Issue2994 locks in the +// canonical "used %" semantics for the 5h window. A fresh account reports a tiny +// secondary-used-percent (~1%); the stored codex_5h_used_percent must equal that value +// directly and must NOT be inverted to ~99%. 
Regression guard for issue #2994 / the reverted +// commit b65dde63 (PR #2918), which applied `100 - used` and made fresh accounts look +// exhausted, tripping auto-pause and excluding them from scheduling. +func TestBuildCodexUsageExtraUpdates_FreshAccountUsedPercentNotInverted_Issue2994(t *testing.T) { + secondaryUsed := 1.0 // 5h window: barely used + secondaryWindow := 300 + primaryUsed := 2.0 // 7d window: barely used + primaryWindow := 10080 + + snapshot := &OpenAICodexUsageSnapshot{ + PrimaryUsedPercent: &primaryUsed, + PrimaryWindowMinutes: &primaryWindow, + SecondaryUsedPercent: &secondaryUsed, + SecondaryWindowMinutes: &secondaryWindow, + UpdatedAt: "2026-02-16T10:00:00Z", + } + + updates := buildCodexUsageExtraUpdates(snapshot, time.Date(2026, 2, 16, 10, 0, 0, 0, time.UTC)) + if updates == nil { + t.Fatal("expected non-nil updates") + } + + if got := updates["codex_5h_used_percent"]; got != 1.0 { + t.Fatalf("codex_5h_used_percent = %v, want 1.0 (direct used%%, NOT inverted to 99)", got) + } + if got := updates["codex_7d_used_percent"]; got != 2.0 { + t.Fatalf("codex_7d_used_percent = %v, want 2.0 (direct used%%, NOT inverted to 98)", got) + } +} + func TestBuildCodexUsageExtraUpdates_FallbackToNowWhenUpdatedAtInvalid(t *testing.T) { primaryUsed := 15.0 primaryReset := 30