Skip to content

Commit b6e543c

Browse files
committed
fix(cli): retry mechanism when multiple systems query the MAPI
When multiple systems hit the MAPI with the same access token, the retry mechanisms quickly gave up. By decreasing the semaphore batch size and increasing the max retries, we make such situations less likely. However, it is still possible to run into limits when many systems query the MAPI with the same access token in parallel. See the following article for the rationale behind the full jitter retry delay: https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ Fixes WDX-184
1 parent d570a6c commit b6e543c

File tree

11 files changed

+60
-21
lines changed

11 files changed

+60
-21
lines changed

packages/cli/src/commands/migrations/run/index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ migrationsCommand.command('run [componentName]')
114114
query,
115115
starts_with: startsWith,
116116
},
117-
batchSize: 100,
117+
batchSize: 12,
118118
onTotal: (total) => {
119119
storiesProgress.setTotal(total);
120120
migrationsProgress.setTotal(total);
@@ -141,7 +141,7 @@ migrationsCommand.command('run [componentName]')
141141
space,
142142
publish,
143143
dryRun,
144-
batchSize: 100,
144+
batchSize: 12,
145145
onProgress: () => {
146146
updateProgress.increment();
147147
},

packages/cli/src/commands/migrations/run/streams/stories-stream.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,7 @@ class StoriesStream extends Transform {
9191
objectMode: true,
9292
});
9393

94-
this.semaphore = new Sema(this.batchSize, {
95-
capacity: this.batchSize,
96-
});
94+
this.semaphore = new Sema(this.batchSize);
9795
}
9896

9997
async _transform(chunk: Omit<Story, 'content'>, _encoding: string, callback: (error?: Error | null, data?: any) => void) {

packages/cli/src/commands/migrations/run/streams/update-stream.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,7 @@ export class UpdateStream extends Writable {
4040
totalProcessed: 0,
4141
};
4242

43-
this.semaphore = new Sema(this.batchSize, {
44-
capacity: this.batchSize,
45-
});
43+
this.semaphore = new Sema(this.batchSize);
4644
}
4745

4846
async _write(chunk: { storyId: number; name: string | undefined; content: StoryContent; published?: boolean; unpublished_changes?: boolean }, _encoding: string, callback: (error?: Error | null) => void) {
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/**
2+
* Calculates the delay for the next retry attempt using exponential backoff with full jitter.
3+
*
4+
* @param attempt The current retry attempt number (starting from 0 for the first retry).
5+
* @param baseDelay The initial delay in milliseconds (e.g., 100).
6+
* @param maxDelay The maximum possible delay in milliseconds (e.g., 20000).
7+
* @returns The calculated delay in milliseconds to wait before the next attempt.
8+
*/
9+
export function calculateRetryDelay(
10+
attempt: number,
11+
baseDelay: number = 100,
12+
maxDelay: number = 20000,
13+
): number {
14+
const exponentialBackoff = baseDelay * 2 ** attempt;
15+
const cappedBackoff = Math.min(exponentialBackoff, maxDelay);
16+
// Apply full jitter: a random value between 0 and the capped backoff
17+
const jitter = Math.random() * cappedBackoff;
18+
19+
return jitter;
20+
}

packages/cli/src/utils/delay.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
export const delay = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

packages/cli/src/utils/fetch.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import { delay } from './delay';
2+
import { calculateRetryDelay } from './calculate-retry-delay';
3+
14
export class FetchError extends Error {
25
response: {
36
status: number;
@@ -12,8 +15,6 @@ export class FetchError extends Error {
1215
}
1316
}
1417

15-
export const delay = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));
16-
1718
export interface FetchOptions {
1819
headers?: Record<string, string>;
1920
method?: string;
@@ -64,8 +65,7 @@ export async function customFetch<T>(url: string, options: FetchOptions = {}): P
6465
if (!response.ok) {
6566
// If we hit rate limit and have retries left
6667
if ((response.status === 429) && (attempt < maxRetries)) {
67-
const waitTime = baseDelay * 2 ** attempt;
68-
await delay(waitTime);
68+
await delay(calculateRetryDelay(attempt, baseDelay));
6969
attempt++;
7070
continue;
7171
}

packages/mapi-client/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,15 +140,15 @@ The client includes built-in retry handling for rate limits and network errors:
140140

141141
```typescript
142142
// The client automatically handles retries with these defaults:
143-
// - maxRetries: 3
143+
// - maxRetries: 12
144144
// - retryDelay: 1000ms
145145
// - Respects retry-after headers from 429 responses
146146

147147
const stories = await client.stories.list({
148148
path: { space_id: 123456 },
149149
query: { per_page: 10 }
150150
});
151-
// If rate limited, will automatically retry up to 3 times
151+
// If rate limited, will automatically retry up to 12 times
152152
```
153153

154154
## Runtime Configuration

packages/mapi-client/src/__tests__/integration.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,12 @@ describe('ManagementApiClient Integration - Per-Instance Rate Limiting', () => {
103103
const promise = client.spaces.list({});
104104

105105
// Advance timers to allow all retries
106-
await vi.advanceTimersByTimeAsync(5000);
106+
await vi.advanceTimersByTimeAsync(14000);
107107

108108
const result = await promise;
109109

110-
// Should make initial + maxRetries calls (3 retries = 4 total)
111-
expect(mockFetch).toHaveBeenCalledTimes(4);
110+
// Should make initial + maxRetries calls (12 retries = 13 total)
111+
expect(mockFetch).toHaveBeenCalledTimes(13);
112112

113113
// Result should contain the error since all retries failed
114114
expect(result).toBeDefined();

packages/mapi-client/src/client/client.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import {
99
mergeHeaders,
1010
setAuthParams,
1111
} from './utils';
12+
import { calculateRetryDelay } from "../utils/calculate-retry-delay";
13+
import { delay } from "../utils/delay";
1214

1315
type ReqInit = Omit<RequestInit, 'body' | 'headers'> & {
1416
body?: any;
@@ -90,7 +92,7 @@ export const createClient = (config: Config): Client => {
9092

9193
// Execute with retry logic by recreating the request for each attempt
9294
let response = await executeWithRetry(_fetch, url, requestInit, {
93-
maxRetries: 3,
95+
maxRetries: 12,
9496
retryDelay: 1000
9597
});
9698

@@ -206,9 +208,8 @@ export const createClient = (config: Config): Client => {
206208

207209
if (response.status === 429 && attempt < retryConfig.maxRetries) {
208210
const retryAfter = response.headers.get('retry-after');
209-
const delay = retryAfter ? parseInt(retryAfter) * 1000 : retryConfig.retryDelay;
210-
211-
await new Promise(resolve => setTimeout(resolve, delay));
211+
const retryDelay = retryAfter ? parseInt(retryAfter) * 1000 : calculateRetryDelay(attempt, retryConfig.retryDelay);
212+
await delay(retryDelay);
212213

213214
// Use the original unconsumed request for retry
214215
return executeWithRetry(fetchFn, url, requestInit, retryConfig, attempt + 1);
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/**
2+
* Calculates the delay for the next retry attempt using exponential backoff with full jitter.
3+
*
4+
* @param attempt The current retry attempt number (starting from 0 for the first retry).
5+
* @param baseDelay The initial delay in milliseconds (e.g., 100).
6+
* @param maxDelay The maximum possible delay in milliseconds (e.g., 20000).
7+
* @returns The calculated delay in milliseconds to wait before the next attempt.
8+
*/
9+
export function calculateRetryDelay(
10+
attempt: number,
11+
baseDelay: number = 100,
12+
maxDelay: number = 20000,
13+
): number {
14+
const exponentialBackoff = baseDelay * 2 ** attempt;
15+
const cappedBackoff = Math.min(exponentialBackoff, maxDelay);
16+
// Apply full jitter: a random value between 0 and the capped backoff
17+
const jitter = Math.random() * cappedBackoff;
18+
19+
return jitter;
20+
}

0 commit comments

Comments
 (0)