Nether403 · Nether403 · May 12, 2026 · May 12, 2026
diff --git a/.kiro/specs/phase-8-deployment/tasks.md b/.kiro/specs/phase-8-deployment/tasks.md
diff --git a/apps/api/package.json b/apps/api/package.json
@@ -21,13 +21,16 @@
     "@stackfast/rules-engine": "workspace:*",
     "@stackfast/schemas": "workspace:*",
     "@stackfast/shared": "workspace:*",
+    "@upstash/ratelimit": "^2.0.8",
+    "@upstash/redis": "^1.38.0",
     "better-auth": "^1.1.13",
     "drizzle-orm": "^0.45.2",
     "hono": "^4.6.1",
     "zod": "^3.24.2"
   },
   "devDependencies": {
     "@types/node": "^22.10.2",
+    "fast-check": "^4.8.0",
     "tsx": "^4.19.2",
     "typescript": "^5.6.3"
   }

diff --git a/apps/api/src/app.test.ts b/apps/api/src/app.test.ts
@@ -1,5 +1,14 @@
-import { describe, expect, it } from "vitest";
+import { afterEach, describe, expect, it } from "vitest";
 import app from "./app.js";
+import { BUCKETS } from "./rate-limit/buckets.js";
+import {
+  __resetBackendForTests,
+} from "./rate-limit/index.js";
+import type {
+  RateLimitBackend,
+  RateLimitCheckArgs,
+  RateLimitDecision,
+} from "./rate-limit/types.js";
 
 describe("api", () => {
   it("returns health status", async () => {
@@ -458,4 +467,203 @@ describe("api", () => {
     });
     expect(response.headers.get("X-Request-ID")).toBe(customId);
   });
+
+  // ─── A6 rate-limit contract tests ──────────────────────────────
+  //
+  // These four cases are named verbatim in design.md § 8 "Testing strategy"
+  // and pin down the behavior the factory rewrite in
+  // `apps/api/src/rate-limit/index.ts` has to preserve. Every case is
+  // self-isolating: each one installs its own backend via
+  // `__resetBackendForTests` and restores the default in `afterEach` so a
+  // failing case cannot bleed counters into the next.
+
+  describe("rate-limit contract", () => {
+    afterEach(() => {
+      __resetBackendForTests(null);
+    });
+
+    /**
+     * Build a `RateLimitBackend` that records every `check()` call it
+     * receives into the returned `calls` array. Always returns an allow
+     * decision with full remaining quota so the admin/exempt-route checks
+     * can focus on "was this called at all?".
+     */
+    function createSpyBackend(): {
+      backend: RateLimitBackend;
+      calls: RateLimitCheckArgs[];
+    } {
+      const calls: RateLimitCheckArgs[] = [];
+      const backend: RateLimitBackend = {
+        name: "memory",
+        async check(args: RateLimitCheckArgs): Promise<RateLimitDecision> {
+          calls.push(args);
+          return {
+            allowed: true,
+            remaining: BUCKETS[args.bucket].limit,
+            limit: BUCKETS[args.bucket].limit,
+            resetAtEpochMs: Date.now() + BUCKETS[args.bucket].windowMs,
+          };
+        },
+      };
+      return { backend, calls };
+    }
+
+    /**
+     * Build a `RateLimitBackend` that delegates to a shared Map. Used by
+     * the "bucket count survives backend swap" case to prove that two
+     * wrapper instances sharing the same underlying state keep accounting
+     * consistent across a simulated restart. Semantics mirror the real
+     * memory backend: count is incremented on every call, `allowed` is
+     * `count <= limit`, and the window resets lazily at `resetAtEpochMs`.
+     */
+    function createSharedStateBackend(
+      store: Map<string, { count: number; resetAtEpochMs: number }>,
+    ): RateLimitBackend {
+      return {
+        name: "upstash",
+        async check({ bucket, clientId }): Promise<RateLimitDecision> {
+          const key = `${bucket}:${clientId}`;
+          const config = BUCKETS[bucket];
+          const now = Date.now();
+          let entry = store.get(key);
+          if (!entry || now >= entry.resetAtEpochMs) {
+            entry = { count: 1, resetAtEpochMs: now + config.windowMs };
+            store.set(key, entry);
+          } else {
+            entry.count += 1;
+          }
+          return {
+            allowed: entry.count <= config.limit,
+            remaining: Math.max(0, config.limit - entry.count),
+            limit: config.limit,
+            resetAtEpochMs: entry.resetAtEpochMs,
+          };
+        },
+      };
+    }
+
+    it("admin 401 before rate-limit counter increments (R8.1)", async () => {
+      const { backend, calls } = createSpyBackend();
+      __resetBackendForTests(backend);
+
+      const response = await app.request("/admin/tools/import", {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify({ tools: [{ id: "whatever" }] }),
+      });
+
+      expect(response.status).toBe(401);
+      // The admin middleware rejects before any downstream middleware runs.
+      // `/admin/*` also isn't under `/api/v1/*`, so the rate-limit
+      // middleware cannot match it — the backend is never consulted.
+      expect(calls).toHaveLength(0);
+    });
+
+    it("Retry-After only on 429 (R4.7, R4.8)", async () => {
+      const store = new Map<string, { count: number; resetAtEpochMs: number }>();
+      __resetBackendForTests(createSharedStateBackend(store));
+
+      const clientId = "retry-after-test";
+      const readLimit = BUCKETS.read.limit;
+
+      // Requests 1..limit should all be 200 and MUST NOT include a
+      // Retry-After header. We collect the Retry-After values to assert
+      // they are all null in one shot.
+      const retryAfterDuringAllowed: (string | null)[] = [];
+      for (let i = 0; i < readLimit; i += 1) {
+        const ok = await app.request("/api/v1/tools/search", {
+          headers: { "x-forwarded-for": clientId },
+        });
+        expect(ok.status).toBe(200);
+        retryAfterDuringAllowed.push(ok.headers.get("Retry-After"));
+      }
+      expect(retryAfterDuringAllowed.every((v) => v === null)).toBe(true);
+
+      // Request limit+1 trips the rate limit: must return 429 and a
+      // positive-integer Retry-After header.
+      const blocked = await app.request("/api/v1/tools/search", {
+        headers: { "x-forwarded-for": clientId },
+      });
+      expect(blocked.status).toBe(429);
+      const retryAfter = blocked.headers.get("Retry-After");
+      expect(retryAfter).not.toBeNull();
+      const seconds = Number(retryAfter);
+      expect(Number.isInteger(seconds)).toBe(true);
+      expect(seconds).toBeGreaterThan(0);
+    });
+
+    it("exempt routes never counted (R4.9)", async () => {
+      const { backend, calls } = createSpyBackend();
+      __resetBackendForTests(backend);
+
+      for (let i = 0; i < 5; i += 1) {
+        const health = await app.request("/health");
+        expect(health.status).toBe(200);
+      }
+      for (let i = 0; i < 5; i += 1) {
+        const openapi = await app.request("/openapi.json");
+        expect(openapi.status).toBe(200);
+      }
+
+      expect(calls).toHaveLength(0);
+    });
+
+    it("bucket count survives backend swap (R6.4)", async () => {
+      const store = new Map<string, { count: number; resetAtEpochMs: number }>();
+      __resetBackendForTests(createSharedStateBackend(store));
+
+      const clientId = "backend-swap-test";
+      const generationLimit = BUCKETS.generation.limit;
+      const preSwap = generationLimit - 10; // 20 of the 30-quota window
+
+      // Fire `preSwap` generation-bucket requests. These exercise both
+      // rate-limit middlewares (generation + /api/v1/*), so we just look
+      // at the generation counter in the shared store.
+      for (let i = 0; i < preSwap; i += 1) {
+        const response = await app.request("/api/v1/blueprints", {
+          method: "POST",
+          headers: {
+            "Content-Type": "application/json",
+            "x-forwarded-for": clientId,
+          },
+          // Empty body intentionally: we want the rate-limit middleware
+          // to count every request, but the blueprint handler to short-
+          // circuit quickly with a 400 so the test stays fast.
+          body: "{}",
+        });
+        expect(response.status).not.toBe(429);
+      }
+      expect(
+        store.get(`generation:${clientId}`)?.count,
+      ).toBe(preSwap);
+
+      // Swap the backend wrapper but keep the SAME shared store — this
+      // simulates an API service restart where the underlying Redis
+      // keyspace survives. If accounting resets, the next 11 requests
+      // would all be 200 and the invariant breaks.
+      __resetBackendForTests(createSharedStateBackend(store));
+
+      const postSwapStatuses: number[] = [];
+      for (let i = 0; i < 11; i += 1) {
+        const response = await app.request("/api/v1/blueprints", {
+          method: "POST",
+          headers: {
+            "Content-Type": "application/json",
+            "x-forwarded-for": clientId,
+          },
+          body: "{}",
+        });
+        postSwapStatuses.push(response.status);
+      }
+
+      // 20 pre-swap + first 10 post-swap = 30 (still allowed). Request 31
+      // (the 11th post-swap) must be 429 because the shared store tracks
+      // the cumulative count across the restart.
+      expect(postSwapStatuses.slice(0, 10).every((s) => s !== 429)).toBe(true);
+      expect(postSwapStatuses[10]).toBe(429);
+      expect(
+        store.get(`generation:${clientId}`)?.count,
+      ).toBe(generationLimit + 1);
+    });
+  });
 });
diff --git a/apps/api/src/app.ts b/apps/api/src/app.ts
@@ -18,6 +18,7 @@ import type { MiddlewareHandler } from "hono/types";
 import { z } from "zod";
 import { openApiDocument } from "./openapi.js";
 import { getAuth, requireSession, optionalSession } from "./middleware/auth.js";
+import { createRateLimitMiddleware } from "./rate-limit/index.js";
 
 type Bindings = {
   ADMIN_API_KEY?: string;
@@ -52,11 +53,6 @@ const EnrichToolSchema = z.object({
   force: z.boolean().optional(),
 });
 
-const GENERATION_LIMIT = 30;
-const READ_LIMIT = 100;
-const WINDOW_MS = 60_000;
-export const rateLimitBuckets = new Map<string, { count: number; resetAt: number }>();
-
 const catalogLoader = new CatalogLoader();
 const configuredCorsOrigin = process.env.CORS_ORIGIN ?? process.env.WEB_ORIGIN ?? "http://localhost:5173";
 
@@ -94,9 +90,12 @@ app.use("*", async (c, next) => {
 });
 
 // --- Rate limiting ---
-app.use("/api/v1/blueprints", rateLimit("generation", GENERATION_LIMIT));
-app.use("/api/v1/scaffolds", rateLimit("generation", GENERATION_LIMIT));
-app.use("/api/v1/*", rateLimit("read", READ_LIMIT));
+// `/health` and `/openapi.json` are registered as top-level routes without any
+// `/api/v1/*` prefix, so the rate-limit middleware below never matches them
+// (R4.9: exempt routes never counted).
+app.use("/api/v1/blueprints", createRateLimitMiddleware("generation"));
+app.use("/api/v1/scaffolds", createRateLimitMiddleware("generation"));
+app.use("/api/v1/*", createRateLimitMiddleware("read"));
 
 // --- Auth middleware ---
 app.use("/api/v1/blueprints", requireSession());
@@ -346,29 +345,6 @@ function resolveTools(toolIds: string[]): Tool[] {
   return tools as Tool[];
 }
 
-function rateLimit(bucket: string, limit: number): MiddlewareHandler<{ Bindings: Bindings; Variables: Variables }> {
-  return async (c, next) => {
-    const clientId = c.req.header("x-forwarded-for") ?? c.req.header("cf-connecting-ip") ?? "local";
-    const key = `${bucket}:${clientId}`;
-    const now = Date.now();
-    const current = rateLimitBuckets.get(key);
-
-    if (!current || current.resetAt <= now) {
-      rateLimitBuckets.set(key, { count: 1, resetAt: now + WINDOW_MS });
-      await next();
-      return;
-    }
-
-    if (current.count >= limit) {
-      c.header("Retry-After", String(Math.ceil((current.resetAt - now) / 1000)));
-      return c.json({ error: "Rate limit exceeded", requestId: c.get("requestId") }, 429);
-    }
-
-    current.count += 1;
-    await next();
-  };
-}
-
 function requireAdminApiKey(): MiddlewareHandler<{ Bindings: Bindings; Variables: Variables }> {
   return async (c, next) => {
     const configuredKey = c.env?.ADMIN_API_KEY ?? process.env.ADMIN_API_KEY;

diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts
@@ -1,5 +1,5 @@
 import { serve } from "@hono/node-server";
-import app, { rateLimitBuckets } from "./app.js";
+import app from "./app.js";
 
 const port = Number(process.env.PORT ?? 3000);
 
@@ -10,21 +10,4 @@ serve({
 
 console.log(`Stackfast API listening on http://localhost:${port}`);
 
-// --- Rate limit stale key cleanup (in-memory, single-process) ---
-// TODO Phase 8: Replace with Upstash/Redis-backed rate limiting for multi-instance support
-const RATE_LIMIT_CLEANUP_INTERVAL_MS = 5 * 60_000; // every 5 minutes
-setInterval(() => {
-  const now = Date.now();
-  let cleaned = 0;
-  for (const [key, bucket] of rateLimitBuckets) {
-    if (bucket.resetAt <= now) {
-      rateLimitBuckets.delete(key);
-      cleaned++;
-    }
-  }
-  if (cleaned > 0) {
-    console.log(`[rate-limit] Cleaned ${cleaned} stale bucket(s)`);
-  }
-}, RATE_LIMIT_CLEANUP_INTERVAL_MS);
-
 export default app;
diff --git a/apps/api/src/rate-limit/buckets.test.ts b/apps/api/src/rate-limit/buckets.test.ts
@@ -0,0 +1,24 @@
+import { describe, expect, it } from "vitest";
+import { BUCKETS, BUCKET_NAMES, type BucketName } from "./buckets.js";
+
+describe("rate-limit buckets", () => {
+  it("configures the generation bucket at 30 requests per 60s (R4.2)", () => {
+    expect(BUCKETS.generation).toEqual({ limit: 30, windowMs: 60_000 });
+  });
+
+  it("configures the read bucket at 100 requests per 60s (R4.3)", () => {
+    expect(BUCKETS.read).toEqual({ limit: 100, windowMs: 60_000 });
+  });
+
+  it("exposes exactly the two bucket names used by the middleware", () => {
+    expect(BUCKET_NAMES).toEqual(["generation", "read"]);
+    const keys = Object.keys(BUCKETS) as BucketName[];
+    expect(new Set(keys)).toEqual(new Set(["generation", "read"]));
+  });
+
+  it("uses a 60 second window for every bucket", () => {
+    for (const name of BUCKET_NAMES) {
+      expect(BUCKETS[name].windowMs).toBe(60_000);
+    }
+  });
+});
diff --git a/apps/api/src/rate-limit/buckets.ts b/apps/api/src/rate-limit/buckets.ts
@@ -0,0 +1,27 @@
+/**
+ * Rate-limit bucket configuration.
+ *
+ * Two buckets per R4.2 / R4.3:
+ *   - `generation` — 30 requests per 60s, applied to POST /api/v1/blueprints
+ *     and POST /api/v1/scaffolds.
+ *   - `read`       — 100 requests per 60s, applied to the remaining /api/v1/*
+ *     routes.
+ *
+ * Kept as a pure module so both backends (memory and Upstash, landing in A2
+ * and A4) and the fail-open wrapper (A3) can import it without pulling in
+ * Hono, Redis, or any other runtime dependency.
+ */
+
+export type BucketName = "generation" | "read";
+
+export interface BucketConfig {
+  readonly limit: number;
+  readonly windowMs: number;
+}
+
+export const BUCKETS: Readonly<Record<BucketName, BucketConfig>> = {
+  generation: { limit: 30, windowMs: 60_000 },
+  read: { limit: 100, windowMs: 60_000 },
+} as const;
+
+export const BUCKET_NAMES: readonly BucketName[] = ["generation", "read"] as const;