From c5941fb37d354e0a7ebf6011b255c54be04e9e2e Mon Sep 17 00:00:00 2001 From: Pranay Prakash Date: Fri, 6 Feb 2026 15:32:13 -0800 Subject: [PATCH 1/2] Fix flaky E2E tests - Widen promiseAnyWorkflow timing gaps (100ms/10s vs 1s/3s) to prevent queue scheduling jitter from causing non-deterministic winners - Reduce readableStreamWorkflow inter-chunk delay from 1s to 500ms to reduce total stream duration and waitUntil pressure - Replace fixed `sleep 10` in CI with health-check poll loop after dev.test.ts to prevent e2e tests from hitting a mid-rebuild server - Increase dev.test.ts timeouts from 30s to 60s for nitro-based frameworks that do full (non-incremental) rebuilds on CI - Add beforeAll health check in e2e.test.ts to verify server readiness Co-Authored-By: Claude Opus 4.6 --- .changeset/fix-flaky-e2e-tests.md | 5 +++++ .github/workflows/tests.yml | 10 +++++++++- packages/core/e2e/dev.test.ts | 6 +++--- packages/core/e2e/e2e.test.ts | 21 ++++++++++++++++++++- workbench/example/workflows/99_e2e.ts | 6 +++--- 5 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 .changeset/fix-flaky-e2e-tests.md diff --git a/.changeset/fix-flaky-e2e-tests.md b/.changeset/fix-flaky-e2e-tests.md new file mode 100644 index 0000000000..85fe90ba21 --- /dev/null +++ b/.changeset/fix-flaky-e2e-tests.md @@ -0,0 +1,5 @@ +--- +"@workflow/core": patch +--- + +Fix flaky E2E tests: widen promiseAny timing gaps, reduce stream chunk delay, add health checks and increase dev test timeouts diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a06288c869..a46590646e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -328,7 +328,15 @@ jobs: run: | cd workbench/${{ matrix.app.name }} && pnpm dev & echo "starting tests in 10 seconds" && sleep 10 - pnpm vitest run packages/core/e2e/dev.test.ts; sleep 10 + pnpm vitest run packages/core/e2e/dev.test.ts + echo "Waiting for server to stabilize..." + for i in $(seq 1 60); do + if curl -sf "$DEPLOYMENT_URL/.well-known/workflow/v1/manifest.json" > /dev/null 2>&1; then + echo "Server healthy after ${i}s" + break + fi + sleep 1 + done pnpm run test:e2e --reporter=default --reporter=json --outputFile=e2e-local-dev-${{ matrix.app.name }}-${{ matrix.app.canary && 'canary' || 'stable' }}.json env: NODE_OPTIONS: "--enable-source-maps" diff --git a/packages/core/e2e/dev.test.ts b/packages/core/e2e/dev.test.ts index 824c807933..2a8d7ec549 100644 --- a/packages/core/e2e/dev.test.ts +++ b/packages/core/e2e/dev.test.ts @@ -57,7 +57,7 @@ export function createDevTests(config?: DevTestConfig) { restoreFiles.length = 0; }); - test('should rebuild on workflow change', { timeout: 30_000 }, async () => { + test('should rebuild on workflow change', { timeout: 60_000 }, async () => { const workflowFile = path.join(appPath, workflowsDir, testWorkflowFile); const content = await fs.readFile(workflowFile, 'utf8'); @@ -85,7 +85,7 @@ export async function myNewWorkflow() { } }); - test('should rebuild on step change', { timeout: 30_000 }, async () => { + test('should rebuild on step change', { timeout: 60_000 }, async () => { const stepFile = path.join(appPath, workflowsDir, testWorkflowFile); const content = await fs.readFile(stepFile, 'utf8'); @@ -115,7 +115,7 @@ export async function myNewStep() { test( 'should rebuild on adding workflow file', - { timeout: 30_000 }, + { timeout: 60_000 }, async () => { const workflowFile = path.join( appPath, diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index 086e61b5ee..fb66d1a4c1 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -1,7 +1,7 @@ import { withResolvers } from '@workflow/utils'; import fs from 'fs'; import path from 'path'; -import { afterAll, assert, describe, expect, test } from 'vitest'; +import { afterAll, assert, beforeAll, describe, expect, test } from 'vitest'; import { dehydrateWorkflowArguments } from '../src/serialization'; import { cliHealthJson, @@ -136,6 +136,25 @@ async function getWorkflowReturnValue(runId: string) { // NOTE: Temporarily disabling concurrent tests to avoid flakiness. // TODO: Re-enable concurrent tests after conf when we have more time to investigate. describe('e2e', () => { + // Wait for the deployment to be healthy before running tests + beforeAll(async () => { + for (let i = 1; i <= 60; i++) { + try { + const res = await fetch(deploymentUrl); + if (res.ok) { + console.log(`Server healthy after ${i}s`); + return; + } + } catch { + // Server not ready yet + } + await new Promise((resolve) => setTimeout(resolve, 1_000)); + } + throw new Error( + `Server at ${deploymentUrl} did not become healthy within 60s` + ); + }, 60_000); + // Write E2E metadata file with runIds for observability links afterAll(() => { writeE2EMetadata(); diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index 0c2f362406..a5299fbc95 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -78,8 +78,8 @@ export async function promiseAnyWorkflow() { 'use workflow'; const winner = await Promise.any([ stepThatFails(), - specificDelay(1000, 'b'), // "b" should always win - specificDelay(3000, 'c'), + specificDelay(100, 'b'), // "b" should always win + specificDelay(10000, 'c'), ]); return winner; } @@ -96,7 +96,7 @@ async function genReadableStream() { for (let i = 0; i < 10; i++) { console.log('enqueueing', i); controller.enqueue(encoder.encode(`${i}\n`)); - await new Promise((resolve) => setTimeout(resolve, 1000)); + await new Promise((resolve) => setTimeout(resolve, 500)); } console.log('closing controller'); controller.close(); From fa30b640e39a6934569982c9a12ded66cd4079fd Mon Sep 17 00:00:00 2001 From: Pranay Prakash Date: Fri, 6 Feb 2026 15:50:17 -0800 Subject: [PATCH 2/2] Fix health check to use manifest endpoint with bypass headers - e2e.test.ts beforeAll: poll manifest endpoint with getProtectionBypassHeaders() and AbortController timeout instead of bare fetch(deploymentUrl) which fails with Deployment Protection - tests.yml: add health_ok flag + exit 1 fallback and curl --max-time 5 - Also replace sleep 10 in local-prod and local-postgres jobs with the same health check poll pattern Co-Authored-By: Claude Opus 4.6 --- .github/workflows/tests.yml | 38 ++++++++++++++++++++++++++++++++--- packages/core/e2e/e2e.test.ts | 12 ++++++++++- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a46590646e..a27da270d2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -330,13 +330,19 @@ jobs: echo "starting tests in 10 seconds" && sleep 10 pnpm vitest run packages/core/e2e/dev.test.ts echo "Waiting for server to stabilize..." + health_ok=false for i in $(seq 1 60); do - if curl -sf "$DEPLOYMENT_URL/.well-known/workflow/v1/manifest.json" > /dev/null 2>&1; then + if curl -sf --max-time 5 "$DEPLOYMENT_URL/.well-known/workflow/v1/manifest.json" > /dev/null 2>&1; then echo "Server healthy after ${i}s" + health_ok=true break fi sleep 1 done + if [ "$health_ok" != "true" ]; then + echo "Server did not become healthy within 60s" + exit 1 + fi pnpm run test:e2e --reporter=default --reporter=json --outputFile=e2e-local-dev-${{ matrix.app.name }}-${{ matrix.app.canary && 'canary' || 'stable' }}.json env: NODE_OPTIONS: "--enable-source-maps" @@ -403,7 +409,20 @@ jobs: - name: Run E2E Tests run: | cd workbench/${{ matrix.app.name }} && pnpm start & - echo "starting tests in 10 seconds" && sleep 10 + echo "Waiting for server to be ready..." + health_ok=false + for i in $(seq 1 60); do + if curl -sf --max-time 5 "$DEPLOYMENT_URL/.well-known/workflow/v1/manifest.json" > /dev/null 2>&1; then + echo "Server healthy after ${i}s" + health_ok=true + break + fi + sleep 1 + done + if [ "$health_ok" != "true" ]; then + echo "Server did not become healthy within 60s" + exit 1 + fi pnpm run test:e2e --reporter=default --reporter=json --outputFile=e2e-local-prod-${{ matrix.app.name }}-${{ matrix.app.canary && 'canary' || 'stable' }}.json env: NODE_OPTIONS: "--enable-source-maps" @@ -489,7 +508,20 @@ jobs: - name: Run E2E Tests run: | cd workbench/${{ matrix.app.name }} && pnpm start & - echo "starting tests in 10 seconds" && sleep 10 + echo "Waiting for server to be ready..." + health_ok=false + for i in $(seq 1 60); do + if curl -sf --max-time 5 "$DEPLOYMENT_URL/.well-known/workflow/v1/manifest.json" > /dev/null 2>&1; then + echo "Server healthy after ${i}s" + health_ok=true + break + fi + sleep 1 + done + if [ "$health_ok" != "true" ]; then + echo "Server did not become healthy within 60s" + exit 1 + fi pnpm run test:e2e --reporter=default --reporter=json --outputFile=e2e-local-postgres-${{ matrix.app.name }}-${{ matrix.app.canary && 'canary' || 'stable' }}.json env: NODE_OPTIONS: "--enable-source-maps" diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index fb66d1a4c1..d16db3643c 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -138,9 +138,19 @@ async function getWorkflowReturnValue(runId: string) { describe('e2e', () => { // Wait for the deployment to be healthy before running tests beforeAll(async () => { + const manifestUrl = new URL( + '/.well-known/workflow/v1/manifest.json', + deploymentUrl + ); for (let i = 1; i <= 60; i++) { try { - const res = await fetch(deploymentUrl); + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 5_000); + const res = await fetch(manifestUrl, { + headers: getProtectionBypassHeaders(), + signal: controller.signal, + }); + clearTimeout(timeout); if (res.ok) { console.log(`Server healthy after ${i}s`); return;