diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index b3987d0..c84bea6 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -60,6 +60,10 @@ ELEVENLABS_API_KEY=your-elevenlabs-api-key GOOGLE_APPLICATION_CREDENTIALS=path/to/google-credentials.json # or GOOGLE_SA_PATH=path/to/google-credentials.json + +# CereVoice TTS +CEREVOICE_EMAIL=your-cerevoice-email +CEREVOICE_PASSWORD=your-cerevoice-password ``` ## Code Style diff --git a/docs/RELEASING.md b/docs/RELEASING.md index 0249d88..a5ed41c 100644 --- a/docs/RELEASING.md +++ b/docs/RELEASING.md @@ -74,6 +74,8 @@ The following secrets need to be configured in the GitHub repository for the aut - `MICROSOFT_REGION`: Azure TTS region (optional, for testing) - `ELEVENLABS_API_KEY`: ElevenLabs API key (optional, for testing) - `GOOGLE_SA_KEY`: Google Cloud service account key in JSON format (optional, for testing) +- `CEREVOICE_EMAIL`: CereVoice Cloud account email (optional, for live testing) +- `CEREVOICE_PASSWORD`: CereVoice Cloud account password (optional, for live testing) ## Versioning diff --git a/docs/TESTING.md b/docs/TESTING.md index 7747a04..875cc42 100644 --- a/docs/TESTING.md +++ b/docs/TESTING.md @@ -24,6 +24,15 @@ GOOGLE_APPLICATION_CREDENTIALS=path_to_your_google_credentials_json GOOGLE_SA_PATH=path_to_your_google_service_account_json ``` +### CereVoice TTS +``` +CEREVOICE_EMAIL=your_cerevoice_email +CEREVOICE_PASSWORD=your_cerevoice_password +# Optional +CEREVOICE_ACCESS_TOKEN=existing_access_token +CEREVOICE_REFRESH_TOKEN=existing_refresh_token +``` + You can set these environment variables in a `.env` file in the root of the project. ## Running Tests @@ -57,10 +66,15 @@ npm run test:elevenlabs # Test Google TTS npm run test:google + +# Test CereVoice unit coverage +npm run test:cerevoice ``` If your credentials for the specified engine are invalid, the tests will be skipped with a clear message. 
+`npm run test:cerevoice` runs mocked unit coverage for auth, voice mapping, synthesis request shape, metadata fetching, and word-boundary conversion. It does not require live CereVoice credentials. + ## Running Examples The project includes a unified example framework that can demonstrate all TTS engines or specific engines. diff --git a/examples/README.md b/examples/README.md index fdb7895..55d41ec 100644 --- a/examples/README.md +++ b/examples/README.md @@ -108,6 +108,7 @@ The following engines work in browser environments: - **Azure TTS** - Requires subscription key + region - **Google Cloud TTS** - Requires service account JSON - **AWS Polly** - Requires access key + secret + region +- **CereVoice Cloud** - Requires email + password, supports SSML and word-boundary metadata - **Wit.ai TTS** - Requires API token - **Watson TTS** - Requires API key + URL diff --git a/package.json b/package.json index 8723f61..8433521 100644 --- a/package.json +++ b/package.json @@ -58,6 +58,7 @@ "test:env": "node load-env.js && jest", "test:tts": "node run-tts-tests.cjs", "test:azure": "node run-tts-tests.cjs azure", + "test:cerevoice": "cross-env NODE_OPTIONS=--experimental-vm-modules jest src/__tests__/cerevoice.test.ts", "test:elevenlabs": "node run-tts-tests.cjs elevenlabs", "test:google": "node run-tts-tests.cjs google", "test:gemini": "node run-tts-tests.cjs gemini", @@ -119,6 +120,8 @@ "tts", "text-to-speech", "azure", + "cerevoice", + "cereproc", "google", "gemini", "polly", @@ -251,6 +254,7 @@ "@google-cloud/text-to-speech": "^6.4.0" }, "gemini": {}, + "cerevoice": {}, "elevenlabs": { "@elevenlabs/elevenlabs-js": "^2.32.0" }, diff --git a/src/__tests__/cerevoice.test.ts b/src/__tests__/cerevoice.test.ts new file mode 100644 index 0000000..fdc785b --- /dev/null +++ b/src/__tests__/cerevoice.test.ts @@ -0,0 +1,345 @@ +import { afterEach, describe, expect, it, jest } from "@jest/globals"; +import { CereVoiceTTSClient } from "../engines/cerevoice"; +import { 
createBrowserTTSClient } from "../factory-browser"; +import { createTTSClient } from "../factory"; + +const originalFetch = globalThis.fetch; + +function arrayBufferFromBytes(bytes: Uint8Array): ArrayBuffer { + return bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength); +} + +function response( + body: any, + init: { + ok?: boolean; + status?: number; + statusText?: string; + headers?: Headers; + streamBody?: ReadableStream | null; + bytes?: Uint8Array; + } = {} +) { + const bytes = init.bytes || new Uint8Array([1, 2, 3, 4]); + return { + ok: init.ok ?? true, + status: init.status ?? 200, + statusText: init.statusText ?? "OK", + headers: init.headers || new Headers(), + body: init.streamBody ?? null, + json: async () => body, + text: async () => (typeof body === "string" ? body : JSON.stringify(body)), + arrayBuffer: async () => arrayBufferFromBytes(bytes), + }; +} + +function authResponse(accessToken = "access-token", refreshToken = "refresh-token") { + return response({ + access_token: accessToken, + refresh_token: refreshToken, + }); +} + +function voicesResponse() { + return response({ + voices: [ + { + name: "Heather", + sample_rate: [16000], + language_iso: "en", + country_iso: "GB", + accent_code: "sc", + gender: "female", + language_ms: "809", + country: "Great Britain", + region: "Scotland", + accent: "Scottish", + language: "English", + }, + ], + }); +} + +describe("CereVoiceTTSClient", () => { + afterEach(() => { + globalThis.fetch = originalFetch; + jest.restoreAllMocks(); + }); + + it("initializes with defaults and required credentials", () => { + const client = new CereVoiceTTSClient({ email: "user@example.com", password: "secret" }); + + expect(client.getProperty("voice")).toBe("Heather"); + expect(client.getProperty("audioFormat")).toBe("wav"); + expect(client.capabilities.browserSupported).toBe(true); + expect(client.capabilities.nodeSupported).toBe(true); + expect((client as any).getRequiredCredentials()).toEqual(["email", 
"password"]); + expect(client.getModels()[0].features).toContain("word-boundary-events"); + }); + + it("applies properties from credentials", () => { + const client = new CereVoiceTTSClient({ + email: "user@example.com", + password: "secret", + properties: { + voice: "Sarah", + audioFormat: "mp3", + sampleRate: 16000, + language: "en", + accent: "rp", + metadata: true, + }, + }); + + expect(client.getProperty("voice")).toBe("Sarah"); + expect(client.getProperty("audioFormat")).toBe("mp3"); + expect(client.getProperty("sampleRate")).toBe(16000); + expect(client.getProperty("language")).toBe("en"); + expect(client.getProperty("accent")).toBe("rp"); + expect(client.getProperty("metadata")).toBe(true); + }); + + it("applies JSON properties from credentials", () => { + const client = new CereVoiceTTSClient({ + email: "user@example.com", + password: "secret", + propertiesJson: JSON.stringify({ voice: "William", audioFormat: "ogg" }), + }); + + expect(client.getProperty("voice")).toBe("William"); + expect(client.getProperty("audioFormat")).toBe("ogg"); + }); + + it("creates via node and browser factories", () => { + expect(createTTSClient("cerevoice", { email: "u", password: "p" })).toBeInstanceOf( + CereVoiceTTSClient + ); + expect(createBrowserTTSClient("cerevoice", { email: "u", password: "p" })).toBeInstanceOf( + CereVoiceTTSClient + ); + }); + + it("returns false for checkCredentials without credentials", async () => { + await expect(new CereVoiceTTSClient({}).checkCredentials()).resolves.toBe(false); + }); + + it("authenticates and maps voices", async () => { + const fetchMock = jest.fn(async (url: string, options: any) => { + if (url.endsWith("/auth")) { + expect(options.headers.Authorization).toBe( + `Basic ${Buffer.from("user@example.com:secret").toString("base64")}` + ); + return authResponse(); + } + + if (url.endsWith("/voices")) { + expect(options.headers.Authorization).toBe("Bearer access-token"); + return voicesResponse(); + } + + throw new 
Error(`Unexpected URL: ${url}`); + }); + globalThis.fetch = fetchMock as any; + + const client = new CereVoiceTTSClient({ email: "user@example.com", password: "secret" }); + const voices = await client.getVoices(); + + expect(voices).toHaveLength(1); + expect(voices[0].id).toBe("Heather"); + expect(voices[0].provider).toBe("cerevoice"); + expect(voices[0].gender).toBe("Female"); + expect(voices[0].languageCodes[0].bcp47).toBe("en-GB"); + expect(voices[0].metadata?.accent_code).toBe("sc"); + }); + + it("synthesizes plain text with selected query params", async () => { + const audioBytes = new Uint8Array([9, 8, 7, 6]); + const fetchMock = jest.fn(async (url: string, options: any) => { + if (url.endsWith("/auth")) { + return authResponse(); + } + + if (url.includes("/speak")) { + const requestUrl = new URL(url); + expect(requestUrl.searchParams.get("voice")).toBe("Sarah"); + expect(requestUrl.searchParams.get("audio_format")).toBe("mp3"); + expect(requestUrl.searchParams.get("sample_rate")).toBe("16000"); + expect(requestUrl.searchParams.get("metadata")).toBe("false"); + expect(options.headers.Authorization).toBe("Bearer access-token"); + expect(options.headers["Content-Type"]).toBe("text/plain"); + expect(options.headers.Accept).toBe("audio/mpeg"); + expect(options.body).toBe("Hello world"); + return response({}, { bytes: audioBytes }); + } + + throw new Error(`Unexpected URL: ${url}`); + }); + globalThis.fetch = fetchMock as any; + + const client = new CereVoiceTTSClient({ email: "user@example.com", password: "secret" }); + const bytes = await client.synthToBytes("Hello world", { + voice: "Sarah", + format: "mp3", + sampleRate: 16000, + }); + + expect(Array.from(bytes)).toEqual([9, 8, 7, 6]); + }); + + it("preserves SSML and sends XML content", async () => { + const fetchMock = jest.fn(async (url: string, options: any) => { + if (url.endsWith("/auth")) { + return authResponse(); + } + + if (url.includes("/speak")) { + 
expect(options.headers["Content-Type"]).toBe("text/xml"); + expect(options.body).toBe("Hello world"); + return response({}); + } + + throw new Error(`Unexpected URL: ${url}`); + }); + globalThis.fetch = fetchMock as any; + + const client = new CereVoiceTTSClient({ email: "user@example.com", password: "secret" }); + await client.synthToBytes("Hello world"); + }); + + it("fetches CereVoice metadata and converts it to wrapper word boundaries", async () => { + const stream = new ReadableStream({ + start(controller) { + controller.enqueue(new Uint8Array([1, 2, 3])); + controller.close(); + }, + }); + const metadataHeaders = new Headers({ + "X-CereVoice-Metadata": "https://metadata.example.test/trans.xml", + }); + const fetchMock = jest.fn(async (url: string, options: any) => { + if (url.endsWith("/auth")) { + return authResponse(); + } + + if (url.includes("/speak")) { + expect(new URL(url).searchParams.get("metadata")).toBe("true"); + return response({}, { headers: metadataHeaders, streamBody: stream }); + } + + if (url === "https://metadata.example.test/trans.xml") { + expect(options.headers.Authorization).toBeUndefined(); + return response( + '' + ); + } + + throw new Error(`Unexpected URL: ${url}`); + }); + globalThis.fetch = fetchMock as any; + + const client = new CereVoiceTTSClient({ email: "user@example.com", password: "secret" }); + const result = await client.synthToBytestream("Hello world", { useWordBoundary: true }); + + expect(result.wordBoundaries).toEqual([ + { text: "hello", offset: 1000, duration: 2000 }, + { text: "world", offset: 3000, duration: 3500 }, + ]); + expect(result.audioStream).toBe(stream); + }); + + it("returns empty boundaries when metadata is malformed", async () => { + const fetchMock = jest.fn(async (url: string) => { + if (url.endsWith("/auth")) { + return authResponse(); + } + + if (url.includes("/speak")) { + return response( + {}, + { + headers: new Headers({ + "X-CereVoice-Metadata": "https://metadata.example.test/bad.xml", + }), 
+ } + ); + } + + if (url === "https://metadata.example.test/bad.xml") { + return response(''); + } + + throw new Error(`Unexpected URL: ${url}`); + }); + globalThis.fetch = fetchMock as any; + + const client = new CereVoiceTTSClient({ email: "user@example.com", password: "secret" }); + const result = await client.synthToBytestream("Hello world", { useWordBoundary: true }); + + expect(result.wordBoundaries).toEqual([]); + }); + + it("fills zero-duration CereVoice word metadata from the next word offset", async () => { + const fetchMock = jest.fn(async (url: string) => { + if (url.endsWith("/auth")) { + return authResponse(); + } + + if (url.includes("/speak")) { + return response( + {}, + { + headers: new Headers({ + "X-CereVoice-Metadata": "https://metadata.example.test/zero-duration.xml", + }), + } + ); + } + + if (url === "https://metadata.example.test/zero-duration.xml") { + return response( + '' + ); + } + + throw new Error(`Unexpected URL: ${url}`); + }); + globalThis.fetch = fetchMock as any; + + const client = new CereVoiceTTSClient({ email: "user@example.com", password: "secret" }); + const result = await client.synthToBytestream("Hello world", { useWordBoundary: true }); + + expect(result.wordBoundaries).toEqual([ + { text: "hello", offset: 300, duration: 3900 }, + { text: "world", offset: 4200, duration: 5000 }, + ]); + }); + + it("refreshes and retries once after a 401", async () => { + const fetchMock = jest.fn(async (url: string, options: any) => { + if (url.includes("/speak") && options.headers.Authorization === "Bearer stale-token") { + return response("expired", { ok: false, status: 401, statusText: "Unauthorized" }); + } + + if (url.includes("/auth/refresh")) { + expect(new URL(url).searchParams.get("refresh_token")).toBe("refresh-token"); + return response({ access_token: "new-token" }); + } + + if (url.includes("/speak") && options.headers.Authorization === "Bearer new-token") { + return response({}, { bytes: new Uint8Array([5, 5]) }); + } + + 
throw new Error(`Unexpected URL: ${url}`); + }); + globalThis.fetch = fetchMock as any; + + const client = new CereVoiceTTSClient({ + accessToken: "stale-token", + refreshToken: "refresh-token", + }); + const bytes = await client.synthToBytes("Hello"); + + expect(Array.from(bytes)).toEqual([5, 5]); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); +}); diff --git a/src/browser.ts b/src/browser.ts index cfc1b2d..5368f36 100644 --- a/src/browser.ts +++ b/src/browser.ts @@ -9,6 +9,7 @@ export { AbstractTTSClient } from "./core/abstract-tts"; // Browser-compatible engines export { AzureTTSClient } from "./engines/azure"; export { CartesiaTTSClient } from "./engines/cartesia"; +export { CereVoiceTTSClient } from "./engines/cerevoice"; export { DeepgramTTSClient } from "./engines/deepgram"; export { ElevenLabsTTSClient } from "./engines/elevenlabs"; export { EspeakBrowserTTSClient } from "./engines/espeak-wasm"; diff --git a/src/core/ssml-compatibility.ts b/src/core/ssml-compatibility.ts index e00efc3..a201cb9 100644 --- a/src/core/ssml-compatibility.ts +++ b/src/core/ssml-compatibility.ts @@ -87,6 +87,30 @@ export const ENGINE_SSML_CAPABILITIES: Record = { requiresNamespace: false, requiresVersion: false, }, + cerevoice: { + supportsSSML: true, + supportLevel: "full", + supportedTags: [ + "speak", + "audio", + "break", + "emphasis", + "lexicon", + "mark", + "meta", + "metadata", + "p", + "phoneme", + "prosody", + "say-as", + "sub", + "s", + "voice", + ], + unsupportedTags: ["lang"], + requiresNamespace: false, + requiresVersion: false, + }, // Partial SSML Support azure: { diff --git a/src/engines/cerevoice.ts b/src/engines/cerevoice.ts new file mode 100644 index 0000000..2ab293f --- /dev/null +++ b/src/engines/cerevoice.ts @@ -0,0 +1,766 @@ +import { AbstractTTSClient } from "../core/abstract-tts"; +import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } 
from "../types"; +import { getFetch } from "../utils/fetch-utils"; +import { toIso639_3, toLanguageDisplay } from "../utils/language-utils"; + +export interface CereVoiceTTSCredentials extends TTSCredentials { + email?: string; + password?: string; + accessToken?: string; + refreshToken?: string; + baseURL?: string; + voice?: string; + sampleRate?: number; + audioFormat?: "wav" | "mp3" | "ogg"; + properties?: Record | string; + propertiesJson?: string; +} + +export interface CereVoiceTTSOptions extends SpeakOptions { + voice?: string; + audioFormat?: "wav" | "mp3" | "ogg"; + sampleRate?: number; + language?: string; + accent?: string; + metadata?: boolean; + providerOptions?: Record; +} + +type CereVoiceVoice = { + name?: string; + sample_rate?: number[]; + language_iso?: string; + country_iso?: string; + accent_code?: string; + gender?: string; + language_ms?: string; + country?: string; + region?: string; + accent?: string; + language?: string; +}; + +type WordBoundaryResult = Array<{ text: string; offset: number; duration: number }>; + +const TOKEN_LIFETIME_MS = 3 * 60 * 60 * 1000; +const TOKEN_EXPIRY_BUFFER_MS = 60 * 1000; +const SUPPORTED_AUDIO_FORMATS = new Set(["wav", "mp3", "ogg"]); + +export class CereVoiceTTSClient extends AbstractTTSClient { + private email: string; + private password: string; + private accessToken: string; + private refreshToken: string; + private baseUrl: string; + private audioFormat: "wav" | "mp3" | "ogg"; + private outputSampleRate?: number; + private language?: string; + private accent?: string; + private metadata = false; + private tokenExpiresAt = 0; + + constructor(credentials: CereVoiceTTSCredentials = {}) { + super(credentials); + + this.email = + credentials.email || + (typeof process !== "undefined" ? process.env.CEREVOICE_EMAIL || "" : ""); + this.password = + credentials.password || + (typeof process !== "undefined" ? 
process.env.CEREVOICE_PASSWORD || "" : ""); + this.accessToken = + credentials.accessToken || + (typeof process !== "undefined" ? process.env.CEREVOICE_ACCESS_TOKEN || "" : ""); + this.refreshToken = + credentials.refreshToken || + (typeof process !== "undefined" ? process.env.CEREVOICE_REFRESH_TOKEN || "" : ""); + this.baseUrl = (credentials.baseURL || "https://api.cerevoice.com/v2").replace(/\/+$/, ""); + this.voiceId = credentials.voice || "Heather"; + this.audioFormat = credentials.audioFormat || "wav"; + this.outputSampleRate = credentials.sampleRate; + if (this.outputSampleRate) { + this.sampleRate = this.outputSampleRate; + } + + this.capabilities = { + browserSupported: true, + nodeSupported: true, + needsWasm: false, + }; + this._models = [ + { id: "cerevoice-cloud-v2", features: ["streaming", "ssml", "word-boundary-events"] }, + ]; + + if (this.accessToken) { + this.tokenExpiresAt = Number.POSITIVE_INFINITY; + } + + this.applyCredentialProperties(credentials); + } + + private applyCredentialProperties(credentials: CereVoiceTTSCredentials): void { + const rawProps = + credentials.properties ?? + credentials.propertiesJson ?? 
+ (credentials as Record).propertiesJSON; + + if (!rawProps) { + return; + } + + let parsed: Record | null = null; + if (typeof rawProps === "string") { + try { + parsed = JSON.parse(rawProps) as Record; + } catch { + parsed = null; + } + } else if (typeof rawProps === "object") { + parsed = rawProps as Record; + } + + if (!parsed) { + return; + } + + for (const [key, value] of Object.entries(parsed)) { + this.setProperty(key, value as any); + } + } + + setVoice(voiceId: string, lang?: string): void { + this.voiceId = voiceId; + if (lang) { + this.lang = lang; + } + } + + getProperty(property: string): any { + switch (property) { + case "voice": + return this.voiceId; + case "baseURL": + return this.baseUrl; + case "audioFormat": + return this.audioFormat; + case "sampleRate": + return this.outputSampleRate; + case "language": + return this.language; + case "accent": + return this.accent; + case "metadata": + return this.metadata; + default: + return super.getProperty(property); + } + } + + setProperty(property: string, value: any): void { + switch (property) { + case "voice": + this.setVoice(String(value)); + break; + case "baseURL": + case "baseUrl": + this.baseUrl = String(value).replace(/\/+$/, ""); + break; + case "audioFormat": + if (this.isSupportedAudioFormat(value)) { + this.audioFormat = value; + } + break; + case "sampleRate": { + const sampleRate = Number(value); + if (Number.isFinite(sampleRate) && sampleRate > 0) { + this.outputSampleRate = sampleRate; + this.sampleRate = sampleRate; + } + break; + } + case "language": + this.language = String(value); + break; + case "accent": + this.accent = String(value); + break; + case "metadata": + this.metadata = Boolean(value); + break; + default: + super.setProperty(property, value); + break; + } + } + + async checkCredentials(): Promise { + if (!this.accessToken && !this.refreshToken && (!this.email || !this.password)) { + return false; + } + + try { + const voices = await this._getVoices(); + return 
voices.length > 0; + } catch { + return false; + } + } + + protected getRequiredCredentials(): string[] { + return ["email", "password"]; + } + + protected async _getVoices(): Promise { + try { + const response = await this.fetchWithAuth(this.buildUrl("/voices")); + if (!response.ok) { + return []; + } + + const data = (await response.json()) as { voices?: CereVoiceVoice[] }; + return Array.isArray(data.voices) ? data.voices : []; + } catch { + return []; + } + } + + protected async _mapVoicesToUnified(rawVoices: any[]): Promise { + return (rawVoices as CereVoiceVoice[]).map((voice) => { + const language = voice.language_iso || "en"; + const country = voice.country_iso || undefined; + const bcp47 = country ? `${language.toLowerCase()}-${country.toUpperCase()}` : language; + + return { + id: voice.name || "unknown", + name: voice.name || "Unknown", + gender: this.mapGender(voice.gender), + provider: "cerevoice", + languageCodes: [ + { + bcp47, + iso639_3: toIso639_3(bcp47), + display: toLanguageDisplay(bcp47), + }, + ], + metadata: { + sample_rate: voice.sample_rate, + accent_code: voice.accent_code, + accent: voice.accent, + country: voice.country, + region: voice.region, + language_iso: voice.language_iso, + country_iso: voice.country_iso, + language_ms: voice.language_ms, + language: voice.language, + }, + }; + }); + } + + async synthToBytes(text: string, options: CereVoiceTTSOptions = {}): Promise { + const prepared = await this.prepareInput(text, options); + const wantsMetadata = this.shouldRequestMetadata(options); + const response = await this.requestSynthesis(prepared, options, wantsMetadata); + const audioBytes = new Uint8Array(await response.arrayBuffer()); + + if (wantsMetadata) { + const wordBoundaries = await this.getWordBoundariesFromResponse(response); + if (wordBoundaries.length > 0) { + this.timings = wordBoundaries.map((wb) => [ + wb.offset / 10000, + (wb.offset + wb.duration) / 10000, + wb.text, + ]); + } + } else { + 
this._createEstimatedWordTimings(prepared.plainText); + } + + return audioBytes; + } + + async synthToBytestream( + text: string, + options: CereVoiceTTSOptions = {} + ): Promise<{ + audioStream: ReadableStream; + wordBoundaries: WordBoundaryResult; + }> { + const prepared = await this.prepareInput(text, options); + const wantsMetadata = this.shouldRequestMetadata(options); + const response = await this.requestSynthesis(prepared, options, wantsMetadata); + const wordBoundaries = wantsMetadata ? await this.getWordBoundariesFromResponse(response) : []; + + if (wordBoundaries.length > 0) { + this.timings = wordBoundaries.map((wb) => [ + wb.offset / 10000, + (wb.offset + wb.duration) / 10000, + wb.text, + ]); + } + + if (response.body) { + return { + audioStream: response.body, + wordBoundaries, + }; + } + + const audioBytes = new Uint8Array(await response.arrayBuffer()); + const audioStream = new ReadableStream({ + start(controller) { + controller.enqueue(audioBytes); + controller.close(); + }, + }); + + return { + audioStream, + wordBoundaries, + }; + } + + private async requestSynthesis( + prepared: { body: string; contentType: "text/plain" | "text/xml" }, + options: CereVoiceTTSOptions, + metadata: boolean + ) { + const audioFormat = this.resolveAudioFormat(options); + const providerOptions = options.providerOptions || {}; + const url = this.buildUrl("/speak", { + voice: options.voice || this.voiceId || undefined, + audio_format: audioFormat, + sample_rate: options.sampleRate || this.outputSampleRate, + language: options.language || this.language, + accent: options.accent || this.accent, + metadata, + ...providerOptions, + }); + + const response = await this.fetchWithAuth(url, { + method: "POST", + headers: { + Accept: this.acceptHeaderForFormat(audioFormat), + "Content-Type": prepared.contentType, + }, + body: prepared.body, + }); + + if (!response.ok) { + const errorText = await this.safeReadErrorText(response); + throw new Error( + `CereVoice API error: 
${response.status} ${response.statusText}${errorText ? ` - ${errorText}` : ""}` + ); + } + + return response; + } + + private async prepareInput( + text: string, + options: CereVoiceTTSOptions + ): Promise<{ body: string; contentType: "text/plain" | "text/xml"; plainText: string }> { + let processedText = text; + + if (options.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { + processedText = await SpeechMarkdown.toSSML(processedText, "w3c"); + } + + if (options.rawSSML || this.isXmlLike(processedText)) { + const body = + options.rawSSML && !this.isXmlLike(processedText) + ? SSMLUtils.wrapWithSpeakTags(this.escapeXml(processedText)) + : processedText; + return { + body, + contentType: "text/xml", + plainText: SSMLUtils.stripSSML(body), + }; + } + + if (this.shouldApplyProsody(options)) { + const attrs: string[] = []; + const rate = options.rate ?? this.properties.rate; + const pitch = options.pitch ?? this.properties.pitch; + const volume = options.volume ?? this.properties.volume; + + if (rate && rate !== "medium") { + attrs.push(`rate="${rate}"`); + } + if (pitch && pitch !== "medium") { + attrs.push(`pitch="${pitch}"`); + } + if (volume !== undefined && volume !== 100) { + attrs.push(`volume="${volume}"`); + } + + const escapedText = this.escapeXml(processedText); + const body = + attrs.length > 0 + ? 
`${escapedText}` + : `${escapedText}`; + + return { + body, + contentType: "text/xml", + plainText: processedText, + }; + } + + return { + body: processedText, + contentType: "text/plain", + plainText: processedText, + }; + } + + private shouldApplyProsody(options: SpeakOptions): boolean { + return ( + options.rate !== undefined || + options.pitch !== undefined || + options.volume !== undefined || + this.properties.rate !== "medium" || + this.properties.pitch !== "medium" || + this.properties.volume !== 100 + ); + } + + private shouldRequestMetadata(options: CereVoiceTTSOptions): boolean { + return Boolean(options.useWordBoundary || options.metadata || this.metadata); + } + + private async getWordBoundariesFromResponse(response: { + headers?: Headers; + }): Promise { + const metadataUrl = this.getHeader(response.headers, "X-CereVoice-Metadata"); + if (!metadataUrl) { + return []; + } + + try { + const metadataResponse = await getFetch()(metadataUrl, { + method: "GET", + headers: { + Accept: "text/xml, application/xml, text/plain", + }, + }); + + if (!metadataResponse.ok) { + return []; + } + + return this.parseMetadataXml(await metadataResponse.text()); + } catch { + return []; + } + } + + private parseMetadataXml(xml: string): WordBoundaryResult { + if (!xml.trim()) { + return []; + } + + if (typeof DOMParser !== "undefined") { + try { + const document = new DOMParser().parseFromString(xml, "application/xml"); + const words = Array.from(document.getElementsByTagName("word")); + const parsed = words + .map((word) => + this.createWordBoundary( + word.getAttribute("name"), + word.getAttribute("start"), + word.getAttribute("end") + ) + ) + .filter((word): word is { text: string; offset: number; duration: number } => + Boolean(word) + ); + + if (parsed.length > 0) { + return this.fillMissingDurations(parsed); + } + } catch { + return []; + } + } + + const wordBoundaries: WordBoundaryResult = []; + const wordTagRegex = /]*)\/?>/gi; + let wordMatch: RegExpExecArray | 
null = wordTagRegex.exec(xml); + + while (wordMatch !== null) { + const attributes = this.parseXmlAttributes(wordMatch[1]); + const boundary = this.createWordBoundary(attributes.name, attributes.start, attributes.end); + if (boundary) { + wordBoundaries.push(boundary); + } + wordMatch = wordTagRegex.exec(xml); + } + + return this.fillMissingDurations(wordBoundaries); + } + + private fillMissingDurations(wordBoundaries: WordBoundaryResult): WordBoundaryResult { + return wordBoundaries.map((boundary, index) => { + if (boundary.duration > 0) { + return boundary; + } + + const next = wordBoundaries[index + 1]; + const fallbackDuration = next ? Math.max(next.offset - boundary.offset, 0) : 5000; + + return { + ...boundary, + duration: fallbackDuration, + }; + }); + } + + private parseXmlAttributes(attributeText: string): Record { + const attributes: Record = {}; + const attrRegex = /([A-Za-z_:][\w:.-]*)\s*=\s*(?:"([^"]*)"|'([^']*)')/g; + let attrMatch: RegExpExecArray | null = attrRegex.exec(attributeText); + + while (attrMatch !== null) { + attributes[attrMatch[1]] = this.decodeXmlEntities(attrMatch[2] ?? attrMatch[3] ?? 
""); + attrMatch = attrRegex.exec(attributeText); + } + + return attributes; + } + + private createWordBoundary( + name: string | null | undefined, + start: string | null | undefined, + end: string | null | undefined + ): { text: string; offset: number; duration: number } | null { + if (!name || start === undefined || start === null || end === undefined || end === null) { + return null; + } + + const startSeconds = Number(start); + const endSeconds = Number(end); + if ( + !Number.isFinite(startSeconds) || + !Number.isFinite(endSeconds) || + endSeconds < startSeconds + ) { + return null; + } + + return { + text: name, + offset: Math.round(startSeconds * 10000), + duration: Math.round((endSeconds - startSeconds) * 10000), + }; + } + + private async fetchWithAuth( + url: string, + options: { + method?: string; + headers?: Record; + body?: string | ArrayBuffer | Uint8Array; + } = {}, + retry = true + ) { + const token = await this.ensureAccessToken(); + const response = await getFetch()(url, { + ...options, + headers: { + ...(options.headers || {}), + Authorization: `Bearer ${token}`, + }, + }); + + if (response.status === 401 && retry) { + const refreshedToken = await this.ensureAccessToken(true); + return getFetch()(url, { + ...options, + headers: { + ...(options.headers || {}), + Authorization: `Bearer ${refreshedToken}`, + }, + }); + } + + return response; + } + + private async ensureAccessToken(forceRefresh = false): Promise { + if (!forceRefresh && this.accessToken && Date.now() < this.tokenExpiresAt) { + return this.accessToken; + } + + if (this.refreshToken) { + try { + await this.refreshAccessToken(); + return this.accessToken; + } catch { + if (!this.email || !this.password) { + throw new Error("CereVoice refresh token is invalid or expired"); + } + } + } + + if (!this.email || !this.password) { + throw new Error("CereVoice email and password are required for authentication"); + } + + await this.login(); + return this.accessToken; + } + + private async 
login(): Promise { + const response = await getFetch()(this.buildUrl("/auth"), { + method: "GET", + headers: { + Authorization: `Basic ${this.encodeBasicCredentials(`${this.email}:${this.password}`)}`, + }, + }); + + if (!response.ok) { + const errorText = await this.safeReadErrorText(response); + throw new Error( + `CereVoice auth error: ${response.status} ${response.statusText}${errorText ? ` - ${errorText}` : ""}` + ); + } + + const data = (await response.json()) as { access_token?: string; refresh_token?: string }; + if (!data.access_token) { + throw new Error("CereVoice auth response did not include an access token"); + } + + this.accessToken = data.access_token; + this.refreshToken = data.refresh_token || this.refreshToken; + this.tokenExpiresAt = Date.now() + TOKEN_LIFETIME_MS - TOKEN_EXPIRY_BUFFER_MS; + } + + /** Exchanges the stored refresh token for a new access token with a GET to /auth/refresh; the refresh token is sent as the refresh_token query parameter. Throws on a non-ok response or when the JSON body has no access_token. On success updates accessToken and tokenExpiresAt; the stored refresh token is left unchanged. */ private async refreshAccessToken(): Promise { + const response = await getFetch()( + this.buildUrl("/auth/refresh", { refresh_token: this.refreshToken }), + { + method: "GET", + } + ); + + if (!response.ok) { + const errorText = await this.safeReadErrorText(response); + throw new Error( + `CereVoice refresh error: ${response.status} ${response.statusText}${errorText ? 
` - ${errorText}` : ""}` + ); + } + + const data = (await response.json()) as { access_token?: string }; + if (!data.access_token) { + throw new Error("CereVoice refresh response did not include an access token"); + } + + this.accessToken = data.access_token; + this.tokenExpiresAt = Date.now() + TOKEN_LIFETIME_MS - TOKEN_EXPIRY_BUFFER_MS; + } + + /** Concatenates baseUrl and path into a URL and sets each entry of params as a query parameter (stringified with String()), skipping entries whose value is undefined. Returns the URL as a string. */ private buildUrl( + path: string, + params: Record = {} + ): string { + const url = new URL(`${this.baseUrl}${path}`); + for (const [key, value] of Object.entries(params)) { + if (value !== undefined) { + url.searchParams.set(key, String(value)); + } + } + return url.toString(); + } + + /** Picks the output format: the first truthy value of options.audioFormat, options.format, and the client-level audioFormat. Falls back to the client-level audioFormat when the picked value is not a supported format. */ private resolveAudioFormat(options: CereVoiceTTSOptions): "wav" | "mp3" | "ogg" { + const requested = options.audioFormat || options.format || this.audioFormat; + return this.isSupportedAudioFormat(requested) ? requested : this.audioFormat; + } + + /** Type guard: true when value is a string contained in SUPPORTED_AUDIO_FORMATS. */ private isSupportedAudioFormat(value: unknown): value is "wav" | "mp3" | "ogg" { + return typeof value === "string" && SUPPORTED_AUDIO_FORMATS.has(value); + } + + /** Maps a format to a MIME type string: mp3 to audio/mpeg, ogg to audio/ogg, wav and any other value to audio/wav. */ private acceptHeaderForFormat(format: "wav" | "mp3" | "ogg"): string { + switch (format) { + case "mp3": + return "audio/mpeg"; + case "ogg": + return "audio/ogg"; + case "wav": + default: + return "audio/wav"; + } + } + + /** Normalizes a gender string case-insensitively to Male or Female; any other value, including undefined, yields Unknown. */ private mapGender(gender: string | undefined): "Male" | "Female" | "Unknown" { + const normalized = gender?.toLowerCase(); + if (normalized === "male") { + return "Male"; + } + if (normalized === "female") { + return "Female"; + } + return "Unknown"; + } + + /** Case-insensitive regex test anchored at the start of the text, with leading whitespace allowed; the first alternative matches an XML declaration. NOTE(review): the pattern as rendered here looks incomplete - confirm the remaining alternatives against the real source file. */ private isXmlLike(text: string): boolean { + return /^\s*(<\?xml||\/>))/i.test(text); + } + + /** Runs the text through a fixed chain of global replace() calls, beginning with the ampersand. NOTE(review): in this rendering the replacement strings look already decoded and one pattern is empty; confirm the entity names against the real source file. */ private escapeXml(text: string): string { + return text + .replace(/&/g, "&") + .replace(//g, ">") + .replace(/"/g, """) + .replace(/'/g, "'"); + } + + /** Runs the text through a fixed chain of global replace() calls in which the ampersand replacement comes last. NOTE(review): the entity names appear decoded in this rendering; confirm against the real source file. */ private decodeXmlEntities(text: string): string { + return text + .replace(/'/g, "'") + .replace(/"/g, '"') + .replace(/>/g, ">") + .replace(/</g, "<") + .replace(/&/g, "&"); + } + + /** Reads a header value by name. Returns null when headers is missing. When headers.get is a function it is called with the given name and then with the lower-cased name; otherwise headers is treated as a plain record and indexed the same way, returning null when neither key holds a truthy value. */ private getHeader(headers: Headers | 
undefined, name: string): string | null { + if (!headers) { + return null; + } + + if (typeof headers.get === "function") { + return headers.get(name) || headers.get(name.toLowerCase()); + } + + const record = headers as unknown as Record; + return record[name] || record[name.toLowerCase()] || null; + } + + /** Base64-encodes a string. Uses Buffer with utf8 encoding when Buffer is defined; otherwise encodes the string to bytes with TextEncoder, builds a binary string with one character per byte, and passes it to btoa. */ private encodeBasicCredentials(value: string): string { + if (typeof Buffer !== "undefined") { + return Buffer.from(value, "utf8").toString("base64"); + } + + const bytes = new TextEncoder().encode(value); + let binary = ""; + for (const byte of bytes) { + binary += String.fromCharCode(byte); + } + + return btoa(binary); + } + + /** Reads the response body as text, returning an empty string if reading throws. */ private async safeReadErrorText(response: { text(): Promise }): Promise { + try { + return await response.text(); + } catch { + return ""; + } + } +} diff --git a/src/factory-browser.ts b/src/factory-browser.ts index a1132ca..dcc380c 100644 --- a/src/factory-browser.ts +++ b/src/factory-browser.ts @@ -1,6 +1,7 @@ // Browser-compatible factory for TTS clients import { AzureTTSClient } from "./engines/azure.js"; import { CartesiaTTSClient } from "./engines/cartesia.js"; +import { CereVoiceTTSClient } from "./engines/cerevoice.js"; import { DeepgramTTSClient } from "./engines/deepgram.js"; import { ElevenLabsTTSClient } from "./engines/elevenlabs.js"; import { EspeakBrowserTTSClient } from "./engines/espeak-wasm.js"; @@ -41,6 +42,7 @@ try { export type SupportedBrowserTTS = | "azure" | "cartesia" + | "cerevoice" | "deepgram" | "fishaudio" | "gemini" @@ -107,6 +109,10 @@ export function createBrowserTTSClient(engine: SupportedBrowserTTS, credentials? 
return applyProperties( new CartesiaTTSClient(credentials as import("./engines/cartesia").CartesiaTTSCredentials) ); + case "cerevoice": + return applyProperties( + new CereVoiceTTSClient(credentials as import("./engines/cerevoice").CereVoiceTTSCredentials) + ); case "deepgram": return applyProperties( new DeepgramTTSClient(credentials as import("./engines/deepgram").DeepgramTTSCredentials) diff --git a/src/factory.ts b/src/factory.ts index 4215079..c3eb83c 100644 --- a/src/factory.ts +++ b/src/factory.ts @@ -1,6 +1,7 @@ // Factory for TTS clients (browser/server compatible) import { AzureTTSClient } from "./engines/azure.js"; import { CartesiaTTSClient } from "./engines/cartesia.js"; +import { CereVoiceTTSClient } from "./engines/cerevoice.js"; import { DeepgramTTSClient } from "./engines/deepgram.js"; import { ElevenLabsTTSClient } from "./engines/elevenlabs.js"; import { EspeakTTSClient } from "./engines/espeak.js"; @@ -44,6 +45,7 @@ try { export type SupportedTTS = | "azure" | "cartesia" + | "cerevoice" | "deepgram" | "fishaudio" | "gemini" @@ -113,6 +115,10 @@ export function createTTSClient(engine: SupportedTTS, credentials?: TTSCredentia return applyProperties( new CartesiaTTSClient(credentials as import("./engines/cartesia").CartesiaTTSCredentials) ); + case "cerevoice": + return applyProperties( + new CereVoiceTTSClient(credentials as import("./engines/cerevoice").CereVoiceTTSCredentials) + ); case "deepgram": return applyProperties( new DeepgramTTSClient(credentials as import("./engines/deepgram").DeepgramTTSCredentials) diff --git a/src/index.ts b/src/index.ts index 95e6e2a..3deab5b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -6,6 +6,7 @@ export * as VoiceUtils from "./core/voice-utils"; // Engine exports export { AzureTTSClient } from "./engines/azure"; export { CartesiaTTSClient } from "./engines/cartesia"; +export { CereVoiceTTSClient } from "./engines/cerevoice"; export { DeepgramTTSClient } from "./engines/deepgram"; export { ElevenLabsTTSClient 
} from "./engines/elevenlabs"; export { EspeakNodeTTSClient, EspeakTTSClient } from "./engines/espeak"; diff --git a/src/types.ts b/src/types.ts index fd2f9c5..d0a5e38 100644 --- a/src/types.ts +++ b/src/types.ts @@ -100,6 +100,7 @@ export type UnifiedVoice = { */ provider: | "azure" + | "cerevoice" | "google" | "ibm" | "elevenlabs"