diff --git a/src/lib/utils/cache/remote/cached.ts b/src/lib/utils/cache/remote/cached.ts
index 32a7f7f10..f2d74079c 100644
--- a/src/lib/utils/cache/remote/cached.ts
+++ b/src/lib/utils/cache/remote/cached.ts
@@ -3,6 +3,12 @@ import type { RedisClientType } from '../../../../routes/api/redis';
 
 const ENABLE_CACHE_LOGS = true;
 
+// A cache read should take single-digit milliseconds. If it takes longer the
+// connection is likely degraded (e.g. a half-open socket after an idle drop), so
+// we give up and serve fresh data rather than block the request. A stalled read
+// once hung explore + project SSR for over a minute and took a deployment down.
+const READ_TIMEOUT_MS = 1000;
+
 function log(...content: unknown[]) {
   if (ENABLE_CACHE_LOGS) {
     // eslint-disable-next-line no-console
@@ -10,8 +16,29 @@ function log(...content: unknown[]) {
   }
 }
 
+function withTimeout<T>(promise: Promise<T>, ms: number): Promise<T> {
+  return new Promise<T>((resolve, reject) => {
+    const timer = setTimeout(() => reject(new Error(`redis read timed out after ${ms}ms`)), ms);
+    promise.then(
+      (value) => {
+        clearTimeout(timer);
+        resolve(value);
+      },
+      (err) => {
+        clearTimeout(timer);
+        reject(err);
+      },
+    );
+  });
+}
+
 /**
  * Caches the result of a fetcher function using Redis.
+ *
+ * The cache read is bounded by a short timeout and never throws: if Redis is
+ * slow or unreachable the fetcher runs instead, so a degraded cache slows
+ * requests down rather than breaking them.
+ *
  * @param redis - The Redis instance. If undefined, caching is disabled.
  * @param key - The cache key.
  * @param EX - The expiration time in seconds.
@@ -27,7 +54,18 @@ export default async function cached>(
 ) {
   const keyWithNetwork = `${network.name}-${key}`;
 
-  const cachedResponse = redis && (await redis.get(keyWithNetwork));
+  let cachedResponse: string | null = null;
+  if (redis) {
+    try {
+      cachedResponse = await withTimeout(redis.get(keyWithNetwork), READ_TIMEOUT_MS);
+    } catch (err) {
+      // Degraded cache — fall through to the fetcher rather than blocking.
+      log('CACHE READ FAILED', {
+        keyWithNetwork,
+        error: err instanceof Error ? err.message : String(err),
+      });
+    }
+  }
 
   if (cachedResponse) {
     log('CACHE HIT', { keyWithNetwork });
@@ -38,8 +76,11 @@ export default async function cached>(
 
   const data = await fetcher();
 
-  redis?.set(keyWithNetwork, JSON.stringify(data), {
-    EX,
+  redis?.set(keyWithNetwork, JSON.stringify(data), { EX })?.catch((err) => {
+    log('CACHE WRITE FAILED', {
+      keyWithNetwork,
+      error: err instanceof Error ? err.message : String(err),
+    });
   });
 
   return data;
diff --git a/src/routes/api/redis.ts b/src/routes/api/redis.ts
index df6d80cc4..74ffea611 100644
--- a/src/routes/api/redis.ts
+++ b/src/routes/api/redis.ts
@@ -10,7 +10,17 @@ const connectionString = getOptionalEnvVar(
 );
 
 export const redis =
-  connectionString && !building ? redisSdk.createClient({ url: connectionString }) : undefined;
+  connectionString && !building
+    ? redisSdk.createClient({
+        url: connectionString,
+        // Proactively PING on an idle interval so a dropped or half-open socket
+        // is detected and reconnected. Railway closes idle internal TCP
+        // connections; without this, a low-traffic instance can keep sending
+        // commands into a dead socket that never replies, hanging every cache
+        // read for tens of seconds (the 2026-06-28 Filecoin app outage).
+        pingInterval: 10000,
+      })
+    : undefined;
 export type RedisClientType = typeof redis;
 
 redis?.on('error', (err) => {