diff --git a/README.md b/README.md index 1acdbb7..bdf1a8d 100644 --- a/README.md +++ b/README.md @@ -10,22 +10,38 @@ A SvelteKit application for live-streaming 3D VRM avatars with AI-powered chat, - Threlte/Three.js for 3D rendering - VRM avatar loading and animation with @pixiv/three-vrm - Google Generative AI for conversational responses -- Text-to-speech with lip-sync +- Text-to-speech with lip-sync — pluggable provider (ElevenLabs or 60db) - Chat interface - Mixamo animation integration See [roadmap.md](roadmap.md) for full details and planned features. -### Text-to-Speech and Phonemes +### Text-to-Speech and Lip-Sync -The project uses ElevenLabs TTS with phoneme timings for VRM lip-sync. +The app supports two interchangeable TTS providers, selected at runtime via the +`TTS_PROVIDER` environment variable. Both flow through the same `/api/tts` +endpoint and feed the identical lip-sync + animation pipeline, so the avatar +behaves consistently regardless of which provider is active. + +| Provider | Endpoint | Timing data | Lip-sync source | +| --- | --- | --- | --- | +| **ElevenLabs** (default) | `/v1/text-to-speech/{voice}/with-timestamps` | Per-character alignment | Phoneme-accurate mouth shapes | +| **60db** | `/tts-synthesize` | None returned | Amplitude envelope (volume-driven mouth opening) | + +**Phoneme-based lip-sync (ElevenLabs):** alignment timings are mapped to VRM +viseme blendshapes. Phonemes mapped: A, AA, AH, AE, AO, AW, AY, E, EH, ER, EY, +I, IH, IY, O, OH, OW, OY, U, UH, UW, M, B, P, F, V, TH, L, R, NEUTRAL. + +**Amplitude-based lip-sync (60db):** because 60db returns audio without timing +data, the client analyzes the decoded audio's volume envelope (per-frame RMS) +and opens the mouth proportional to loudness. Speech still animates naturally, +though visemes are generic rather than phoneme-accurate. 
Learn more: - [What is a Phoneme](https://elevenlabs.io/blog/what-is-a-phoneme) - [Prompting Controls](https://elevenlabs.io/docs/best-practices/prompting/controls) - -Phonemes mapped: A, AA, AH, AE, AO, AW, AY, E, EH, ER, EY, I, IH, IY, O, OH, OW, OY, U, UH, UW, M, B, P, F, V, TH, L, R, NEUTRAL. +- [60db API docs](https://docs.60db.ai/api-reference/tts/text-to-speech) ## Getting Started @@ -57,11 +73,36 @@ Set the following environment variables (for example, create a `.env` file in th # Google Generative AI GOOGLE_API_KEY=your_google_api_key +# Text-to-Speech provider selection: "elevenlabs" (default) or "60db" +TTS_PROVIDER=elevenlabs + # ElevenLabs Text-to-Speech ELEVENLABS_API_KEY=your_elevenlabs_api_key + +# 60db Text-to-Speech (used when TTS_PROVIDER=60db) +SIXTYDB_API_KEY=your_60db_api_key +# Optional: a specific 60db voice UUID (from GET /myvoices). Omit to use the account default. +SIXTYDB_VOICE_ID= +``` + +`GOOGLE_API_KEY` is required for chat. For text-to-speech, set `TTS_PROVIDER` +and provide the matching API key. + +### Switching TTS providers + +Set `TTS_PROVIDER` and supply the matching API key: + +```bash +# Use ElevenLabs (default) +TTS_PROVIDER=elevenlabs + +# Use 60db +TTS_PROVIDER=60db ``` -These keys are required for the chat and text-to-speech features. +No code changes or rebuild are needed — restart the dev server to pick up the +new value. See [Text-to-Speech and Lip-Sync](#text-to-speech-and-lip-sync) +for how each provider drives the avatar. 
// --- AMPLITUDE-BASED LIP SYNC (fallback for providers without alignment, e.g. 60db) ---
/**
 * Derive lip-sync timings from an audio buffer's volume envelope.
 *
 * Used when the TTS provider returns audio but no per-character alignment.
 * Channel 0 is cut into frames of `frameMs` milliseconds; each frame's RMS
 * energy is normalized against the loudest frame and emitted as one timing
 * whose `intensity` is that normalized value (0..1). Frames whose normalized
 * energy is below the silence threshold map to `NEUTRAL`, all others to `A`.
 *
 * @param audioBuffer Decoded audio; only the first channel is analysed.
 * @param frameMs Frame length in milliseconds. A frame always spans at least one sample.
 * @returns One timing per frame in playback order; empty when the buffer has no samples.
 */
function generateTimingsFromAmplitude(audioBuffer: AudioBuffer, frameMs = 70): PhonemeTiming[] {
	const SILENCE_THRESHOLD = 0.08; // normalized; below this the mouth closes
	const MIN_PEAK_RMS = 1e-6; // floor so an all-silent buffer never divides by zero

	const channel = audioBuffer.getChannelData(0);
	const sampleRate = audioBuffer.sampleRate;
	const frameSize = Math.max(1, Math.floor((sampleRate * frameMs) / 1000));

	// Per-frame RMS energy. The peak is tracked in the same pass instead of
	// calling Math.max(...rmsValues): spreading one argument per frame can
	// exceed the engine's argument limit and throw a RangeError on long audio.
	const rmsValues: number[] = [];
	let maxRms = MIN_PEAK_RMS;
	for (let i = 0; i < channel.length; i += frameSize) {
		const end = Math.min(i + frameSize, channel.length);
		let sumSquares = 0;
		for (let j = i; j < end; j++) {
			sumSquares += channel[j] * channel[j];
		}
		const rms = Math.sqrt(sumSquares / (end - i));
		rmsValues.push(rms);
		if (rms > maxRms) {
			maxRms = rms;
		}
	}

	const timings: PhonemeTiming[] = [];
	for (let f = 0; f < rmsValues.length; f++) {
		const norm = rmsValues[f] / maxRms; // 0..1
		const start = (f * frameSize) / sampleRate;
		// The last frame may be shorter than frameSize; never run past the audio.
		const end = Math.min(((f + 1) * frameSize) / sampleRate, audioBuffer.duration);
		timings.push({
			phoneme: norm < SILENCE_THRESHOLD ? 'NEUTRAL' : 'A',
			start,
			end,
			intensity: norm
		});
	}

	console.log(`[TTS] Derived ${timings.length} amplitude-based timings (no provider alignment)`);
	return timings;
}
/**
 * Error thrown by a TTS provider when synthesis fails.
 *
 * `status` is an HTTP-style code for the failure; it defaults to 502 when the
 * thrower does not pass one.
 */
export class TTSProviderError extends Error {
	override name = 'TTSProviderError';

	constructor(
		message: string,
		public status = 502
	) {
		super(message);
	}
}
// Hard-coded non-premium ElevenLabs voice; premium voices cost roughly 10x more.
const ELEVENLABS_VOICE_ID = '3XOBzXhnDY98yeWQ3GdM';

// Shared, provider-neutral voice tuning (0..1, ElevenLabs scale).
const STABILITY = 0.5;
const SIMILARITY = 0.75;

/**
 * Synthesize `text` with the TTS provider selected by the `TTS_PROVIDER`
 * environment variable.
 *
 * Accepted values (case-insensitive, surrounding whitespace ignored) are
 * `"60db"` / `"sixtydb"` and `"elevenlabs"`. An unset or empty value selects
 * ElevenLabs. An unrecognised value also falls back to ElevenLabs, but logs a
 * warning so a typo in the configuration does not go unnoticed.
 *
 * @param text Text to speak.
 * @returns Base64 audio, per-character timings (empty for 60db) and the provider used.
 * @throws TTSProviderError when the API key is missing, the provider responds
 *   with a non-2xx status, or the response contains no audio.
 */
export async function synthesizeSpeech(text: string): Promise<TTSResult> {
	const provider = (env.TTS_PROVIDER ?? '').trim().toLowerCase() || 'elevenlabs';

	if (provider === '60db' || provider === 'sixtydb') {
		return synthesizeWith60db(text);
	}
	if (provider !== 'elevenlabs') {
		console.warn(`[TTS] Unknown TTS_PROVIDER "${provider}"; falling back to ElevenLabs.`);
	}
	return synthesizeWithElevenLabs(text);
}

// --- ElevenLabs --------------------------------------------------------------

/**
 * Request speech plus per-character alignment from ElevenLabs.
 *
 * A response without alignment is tolerated (a warning is logged and
 * `phonemes` is empty); a response without audio is an error.
 */
async function synthesizeWithElevenLabs(text: string): Promise<TTSResult> {
	const apiKey = env.ELEVENLABS_API_KEY;
	if (!apiKey) {
		throw new TTSProviderError('ELEVENLABS_API_KEY is not set.', 500);
	}

	const endpoint = `https://api.elevenlabs.io/v1/text-to-speech/${ELEVENLABS_VOICE_ID}/with-timestamps`;

	const response = await fetch(endpoint, {
		method: 'POST',
		headers: {
			'xi-api-key': apiKey,
			'Content-Type': 'application/json',
			Accept: 'application/json'
		},
		body: JSON.stringify({
			text,
			model_id: 'eleven_flash_v2_5',
			voice_settings: {
				stability: STABILITY,
				similarity_boost: SIMILARITY
			}
		})
	});

	if (!response.ok) {
		const errorBody = await response.text();
		console.error(`ElevenLabs API error: ${response.status} ${response.statusText}`, errorBody);
		throw new TTSProviderError(`ElevenLabs request failed: ${response.statusText} - ${errorBody}`);
	}

	const data = await response.json();

	// Mirror the 60db path: a 2xx response without audio is a provider failure,
	// not a result with `audioBase64: undefined`.
	if (typeof data?.audio_base64 !== 'string' || data.audio_base64.length === 0) {
		console.error('[TTS:elevenlabs] No audio_base64 in response.');
		throw new TTSProviderError('ElevenLabs returned no audio.');
	}

	const phonemes: TTSPhoneme[] = [];
	const characters = data.alignment?.characters;
	const startTimes = data.alignment?.character_start_times_seconds;
	const endTimes = data.alignment?.character_end_times_seconds;
	if (Array.isArray(characters) && Array.isArray(startTimes) && Array.isArray(endTimes)) {
		// Use the shortest array so a truncated response cannot yield undefined times.
		const count = Math.min(characters.length, startTimes.length, endTimes.length);
		for (let i = 0; i < count; i++) {
			phonemes.push({ character: characters[i], start: startTimes[i], end: endTimes[i] });
		}
	} else {
		console.warn('[TTS:elevenlabs] Unexpected response structure (no alignment).');
	}

	console.log(`[TTS:elevenlabs] audio present: true, phonemes: ${phonemes.length}`);

	return { audioBase64: data.audio_base64, phonemes, provider: 'elevenlabs' };
}

// --- 60db --------------------------------------------------------------------

/**
 * Request speech from 60db. The result carries no timing data, so `phonemes`
 * is always empty.
 */
async function synthesizeWith60db(text: string): Promise<TTSResult> {
	const apiKey = env.SIXTYDB_API_KEY;
	if (!apiKey) {
		throw new TTSProviderError('SIXTYDB_API_KEY is not set.', 500);
	}

	const endpoint = 'https://api.60db.ai/tts-synthesize';

	const body: Record<string, unknown> = {
		text,
		enhance: true,
		speed: 1,
		// 60db uses a 0..100 scale; map from the shared 0..1 tuning above.
		stability: Math.round(STABILITY * 100),
		similarity: Math.round(SIMILARITY * 100),
		output_format: 'mp3'
	};
	// Optional explicit voice (UUID from GET /myvoices); otherwise the account default is used.
	if (env.SIXTYDB_VOICE_ID) {
		body.voice_id = env.SIXTYDB_VOICE_ID;
	}

	const response = await fetch(endpoint, {
		method: 'POST',
		headers: {
			Authorization: `Bearer ${apiKey}`,
			'Content-Type': 'application/json'
		},
		body: JSON.stringify(body)
	});

	if (!response.ok) {
		const errorBody = await response.text();
		console.error(`60db API error: ${response.status} ${response.statusText}`, errorBody);
		throw new TTSProviderError(`60db request failed: ${response.statusText} - ${errorBody}`);
	}

	const data = await response.json();

	// `data?.` guards against a literal `null` JSON body, which would otherwise
	// surface as a TypeError instead of a TTSProviderError.
	if (!data?.audio_base64) {
		console.error('[TTS:60db] No audio_base64 in response.', data?.message);
		throw new TTSProviderError(`60db returned no audio: ${data?.message || 'unknown error'}`);
	}

	console.log(
		`[TTS:60db] audio present: true, duration: ${data.duration_seconds ?? '?'}s ` +
			`(no alignment — client derives lip-sync from amplitude)`
	);

	// No timing data comes back from this endpoint; phonemes intentionally empty.
	return { audioBase64: data.audio_base64, phonemes: [], provider: '60db' };
}
Please check server logs.'); - } - let requestData: TTSRequestBody; try { requestData = await request.json(); @@ -36,81 +21,19 @@ export const POST: RequestHandler = async ({ request }) => { } try { - // Use the phoneme timing endpoint if phonemes are requested - const endpoint = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps`; - - const elevenLabsResponse = await fetch(endpoint, { - method: 'POST', - headers: { - 'xi-api-key': apiKey, - 'Content-Type': 'application/json', - Accept: 'application/json' - }, - body: JSON.stringify({ - text: text, - model_id: 'eleven_flash_v2_5', - voice_settings: { - stability: 0.5, - similarity_boost: 0.75 - } - }) - }); - - if (!elevenLabsResponse.ok) { - const errorBody = await elevenLabsResponse.text(); - console.error( - `ElevenLabs API error: ${elevenLabsResponse.status} ${elevenLabsResponse.statusText}`, - errorBody - ); - throw error( - 502, - `Failed to fetch audio from ElevenLabs: ${elevenLabsResponse.statusText} - ${errorBody}` - ); - } - - // Handle JSON response with phoneme data - const responseData = await elevenLabsResponse.json(); - - // Simple debug logging - console.log('[TTS API] Audio present:', !!responseData.audio_base64); - console.log('[TTS API] Characters count:', responseData.alignment?.characters?.length || 0); - - // Extract phonemes from ElevenLabs response - let phonemes = []; - - if ( - responseData.alignment?.characters && - responseData.alignment?.character_start_times_seconds && - responseData.alignment?.character_end_times_seconds - ) { - const characters = responseData.alignment.characters; - const startTimes = responseData.alignment.character_start_times_seconds; - const endTimes = responseData.alignment.character_end_times_seconds; - - for (let i = 0; i < characters.length; i++) { - phonemes.push({ - character: characters[i], - start: startTimes[i], - end: endTimes[i] - }); - } - } else { - console.warn('[TTS API] Unexpected response structure from ElevenLabs'); - } - - 
console.log('[TTS API] Extracted phonemes count:', phonemes.length); - if (phonemes.length > 0) { - console.log('[TTS API] Sample phoneme structure:', phonemes[0]); - } + const result = await synthesizeSpeech(text); return json({ - audio_base64: responseData.audio_base64, - phonemes: phonemes + audio_base64: result.audioBase64, + provider: result.provider, + // Client shape: { character, start, end }. Empty for providers without + // alignment (60db) — the client then derives timings from amplitude. + phonemes: result.phonemes }); } catch (e: any) { - console.error('Error proxying TTS request to ElevenLabs:', e); - if (e.status && e.body) { - throw e; + console.error('Error proxying TTS request:', e); + if (e instanceof TTSProviderError) { + throw error(e.status, e.message); } throw error(500, `Internal server error: ${e.message || 'Unknown error'}`); }