dexvdev · manishEMS47 · Jun 15, 2026
diff --git a/README.md b/README.md
@@ -10,22 +10,38 @@ A SvelteKit application for live-streaming 3D VRM avatars with AI-powered chat,
 - Threlte/Three.js for 3D rendering
 - VRM avatar loading and animation with @pixiv/three-vrm
 - Google Generative AI for conversational responses
-- Text-to-speech with lip-sync
+- Text-to-speech with lip-sync — pluggable provider (ElevenLabs or 60db)
 - Chat interface
 - Mixamo animation integration
 
 See [roadmap.md](roadmap.md) for full details and planned features.
 
-### Text-to-Speech and Phonemes
+### Text-to-Speech and Lip-Sync
 
-The project uses ElevenLabs TTS with phoneme timings for VRM lip-sync.
+The app supports two interchangeable TTS providers, selected at runtime via the
+`TTS_PROVIDER` environment variable. Both flow through the same `/api/tts`
+endpoint and feed the identical lip-sync + animation pipeline, so the avatar
+behaves consistently regardless of which provider is active.
+
+| Provider | Endpoint | Timing data | Lip-sync source |
+| --- | --- | --- | --- |
+| **ElevenLabs** (default) | `/v1/text-to-speech/{voice}/with-timestamps` | Per-character alignment | Phoneme-accurate mouth shapes |
+| **60db** | `/tts-synthesize` | None returned | Amplitude envelope (volume-driven mouth opening) |
+
+**Phoneme-based lip-sync (ElevenLabs):** alignment timings are mapped to VRM
+viseme blendshapes. Phonemes mapped: A, AA, AH, AE, AO, AW, AY, E, EH, ER, EY,
+I, IH, IY, O, OH, OW, OY, U, UH, UW, M, B, P, F, V, TH, L, R, NEUTRAL.
+
+**Amplitude-based lip-sync (60db):** because 60db returns audio without timing
+data, the client analyzes the decoded audio's volume envelope (per-frame RMS)
+and opens the mouth in proportion to loudness. Speech still animates, but with
+a single open-mouth shape scaled by volume rather than phoneme-accurate visemes.
 
 Learn more:
 
 - [What is a Phoneme](https://elevenlabs.io/blog/what-is-a-phoneme)
 - [Prompting Controls](https://elevenlabs.io/docs/best-practices/prompting/controls)
-
-Phonemes mapped: A, AA, AH, AE, AO, AW, AY, E, EH, ER, EY, I, IH, IY, O, OH, OW, OY, U, UH, UW, M, B, P, F, V, TH, L, R, NEUTRAL.
+- [60db API docs](https://docs.60db.ai/api-reference/tts/text-to-speech)
 
 ## Getting Started
 
@@ -57,11 +73,36 @@ Set the following environment variables (for example, create a `.env` file in th
 # Google Generative AI
 GOOGLE_API_KEY=your_google_api_key
 
+# Text-to-Speech provider selection: "elevenlabs" (default) or "60db"
+TTS_PROVIDER=elevenlabs
+
 # ElevenLabs Text-to-Speech
 ELEVENLABS_API_KEY=your_elevenlabs_api_key
+
+# 60db Text-to-Speech (used when TTS_PROVIDER=60db)
+SIXTYDB_API_KEY=your_60db_api_key
+# Optional: a specific 60db voice UUID (from GET /myvoices). Omit to use the account default.
+SIXTYDB_VOICE_ID=
+```
+
+`GOOGLE_API_KEY` is required for chat. For text-to-speech, set `TTS_PROVIDER`
+and provide the matching API key.
+
+### Switching TTS providers
+
+Set `TTS_PROVIDER` and supply the matching API key:
+
+```bash
+# Use ElevenLabs (default)
+TTS_PROVIDER=elevenlabs
+
+# Use 60db
+TTS_PROVIDER=60db
 ```
 
-These keys are required for the chat and text-to-speech features.
+No code changes or rebuild are needed — restart the dev server to pick up the
+new value. See [Text-to-Speech and Lip-Sync](#text-to-speech-and-lip-sync)
+for how each provider drives the avatar.
 
 ## Developing
 
@@ -89,7 +130,7 @@ pnpm run preview
 
 ## Keywords
 
-svelte, sveltekit, threejs, threlte, vrm, 3d-avatar, ai-chat, text-to-speech, lipsync, phonemes, mixamo, animations, blockchain, solana, generative-ai, youtube-streaming
+svelte, sveltekit, threejs, threlte, vrm, 3d-avatar, ai-chat, text-to-speech, elevenlabs, 60db, lipsync, phonemes, mixamo, animations, blockchain, solana, generative-ai, youtube-streaming
 
 ## Contributing
 

diff --git a/src/lib/audio/tts.ts b/src/lib/audio/tts.ts
@@ -6,6 +6,7 @@ type PhonemeTiming = {
 	phoneme: string;
 	start: number; // seconds
 	end: number; // seconds
+	intensity?: number; // 0..1 per-frame multiplier; used by amplitude-derived timings (60db)
 };
 
 // Emotion type based on LLM output
@@ -228,6 +229,51 @@ class LipSyncAnimator {
 	}
 }
 
+// --- AMPLITUDE-BASED LIP SYNC (fallback for providers without alignment, e.g. 60db) ---
+/**
+ * Derive lip-sync timings from the volume envelope of `audioBuffer` (channel 0).
+ * Used for providers such as 60db that return audio but no per-character alignment.
+ * @param audioBuffer Decoded audio to analyze; only its first channel is read.
+ * @param frameMs Frame length in milliseconds; a frame is never shorter than one sample.
+ * @returns One timing per frame: 'NEUTRAL' if near-silent, else 'A'; `intensity` is relative RMS.
+ */
+function generateTimingsFromAmplitude(audioBuffer: AudioBuffer, frameMs = 70): PhonemeTiming[] {
+	const channel = audioBuffer.getChannelData(0);
+	const sampleRate = audioBuffer.sampleRate;
+	const frameSize = Math.max(1, Math.floor((sampleRate * frameMs) / 1000));
+
+	// Per-frame RMS energy.
+	const rmsValues: number[] = [];
+	for (let i = 0; i < channel.length; i += frameSize) {
+		const end = Math.min(i + frameSize, channel.length);
+		let sumSquares = 0;
+		for (let j = i; j < end; j++) {
+			sumSquares += channel[j] * channel[j];
+		}
+		rmsValues.push(Math.sqrt(sumSquares / (end - i)));
+	}
+
+	// Fold rather than spread: `Math.max(...rmsValues)` throws a RangeError on long clips.
+	const maxRms = rmsValues.reduce((peak, rms) => Math.max(peak, rms), 1e-6);
+
+	const SILENCE_THRESHOLD = 0.08; // normalized; below this the mouth closes
+	const timings: PhonemeTiming[] = [];
+	for (let f = 0; f < rmsValues.length; f++) {
+		const norm = rmsValues[f] / maxRms; // 0..1
+		const start = (f * frameSize) / sampleRate;
+		const end = Math.min(((f + 1) * frameSize) / sampleRate, audioBuffer.duration);
+		timings.push({
+			phoneme: norm < SILENCE_THRESHOLD ? 'NEUTRAL' : 'A',
+			start,
+			end,
+			intensity: norm
+		});
+	}
+
+	console.log(`[TTS] Derived ${timings.length} amplitude-based timings (no provider alignment)`);
+	return timings;
+}
+
 // --- TTS WITH PHONEMES ---
 async function fetchSpeechWithPhonemes(
 	text: string
@@ -288,11 +334,17 @@ async function fetchSpeechWithPhonemes(
 		});
 	}
 
-	console.log(`[TTS] Extracted ${timings.length} phoneme timings`);
+	console.log(`[TTS] Extracted ${timings.length} phoneme timings (provider: ${data.provider})`);
 	if (timings.length > 0) {
 		console.log('[TTS] First few timings:', timings.slice(0, 5));
 	}
 
+	// Providers without alignment (e.g. 60db) return no phonemes — derive lip-sync
+	// timings from the decoded audio's amplitude envelope so behaviour stays consistent.
+	if (timings.length === 0) {
+		return { audioBuffer, timings: generateTimingsFromAmplitude(audioBuffer) };
+	}
+
 	return { audioBuffer, timings };
 }
 
@@ -362,11 +414,15 @@ export async function speakWithLipsync(
 				console.log(`[TTS] Playing audio and ${timings.length} phoneme animations`);
 			}
 
-			// Schedule all lip sync animations using the improved phoneme mapping
-			timings.forEach(({ phoneme, start, end }) => {
+			// Schedule all lip sync animations using the improved phoneme mapping.
+			// `intensity` (0..1) is present on amplitude-derived timings (60db) and
+			// scales the mouth-open amount per frame; alignment timings (ElevenLabs)
+			// leave it undefined, so the configured intensity is used as-is.
+			timings.forEach(({ phoneme, start, intensity }) => {
 				const expressionWeights = phonemeToVRM[phoneme] || phonemeToVRM['NEUTRAL'];
+				const frameIntensity = finalConfig.intensity * (intensity ?? 1);
 				setTimeout(() => {
-					animator.setBlendedExpression(expressionWeights, finalConfig.intensity);
+					animator.setBlendedExpression(expressionWeights, frameIntensity);
 				}, start * 1000);
 			});
 

diff --git a/src/lib/server/tts.ts b/src/lib/server/tts.ts
@@ -0,0 +1,169 @@
+import { env } from '$env/dynamic/private';
+
+/**
+ * Server-side TTS provider abstraction.
+ *
+ * Both providers return the same shape so the client lip-sync pipeline is
+ * provider-agnostic. ElevenLabs supplies real per-character alignment in
+ * `phonemes`; 60db returns no timing data, so `phonemes` is empty and the
+ * client derives lip-sync timings from the audio amplitude envelope instead.
+ */
+
+export interface TTSPhoneme {
+	character: string; // one entry of the provider's `alignment.characters` (a character, not a phoneme)
+	start: number; // seconds
+	end: number; // seconds
+}
+
+export interface TTSResult {
+	/** Base64-encoded audio. 60db is asked for mp3; the ElevenLabs request sets no output format. */
+	audioBase64: string;
+	/** Per-character timings. Empty for providers without alignment (e.g. 60db). */
+	phonemes: TTSPhoneme[];
+	/** Which provider produced this result. */
+	provider: 'elevenlabs' | '60db';
+}
+
+/** Thrown by the provider functions in this file on failure; carries an HTTP-style status. */
+export class TTSProviderError extends Error {
+	status: number; // 500 for a missing API key in this file, otherwise the 502 default
+	constructor(message: string, status = 502) {
+		super(message);
+		this.name = 'TTSProviderError'; // replaces the inherited 'Error' name
+		this.status = status;
+	}
+}
+
+// Non-premium ElevenLabs voice (premium voices are ~10x the cost).
+const ELEVENLABS_VOICE_ID = '3XOBzXhnDY98yeWQ3GdM';
+
+// Voice tuning shared by both providers (0..1): sent as-is to ElevenLabs, scaled x100 for 60db.
+const STABILITY = 0.5; // request field `stability` for both providers
+const SIMILARITY = 0.75; // `similarity_boost` for ElevenLabs, `similarity` for 60db
+
+/**
+ * Dispatch to the provider named by `TTS_PROVIDER` ("elevenlabs", or "60db" / "sixtydb"),
+ * ignoring case and padding. Unset or unknown values use ElevenLabs; unknown ones warn.
+ */
+export async function synthesizeSpeech(text: string): Promise<TTSResult> {
+	const provider = (env.TTS_PROVIDER ?? '').trim().toLowerCase() || 'elevenlabs';
+	const use60db = provider === '60db' || provider === 'sixtydb';
+	if (!use60db && provider !== 'elevenlabs') {
+		console.warn(`[TTS] Unknown TTS_PROVIDER "${provider}"; falling back to elevenlabs.`);
+	}
+	return use60db ? synthesizeWith60db(text) : synthesizeWithElevenLabs(text);
+}
+
+// --- ElevenLabs --------------------------------------------------------------
+
+/**
+ * Synthesize `text` with ElevenLabs' `with-timestamps` endpoint.
+ * @returns Base64 audio plus one `TTSPhoneme` per aligned character; `phonemes` is
+ *   empty (with a warning) when the response has no usable alignment.
+ * @throws {TTSProviderError} 500 if `ELEVENLABS_API_KEY` is unset; 502 if the request
+ *   fails or the response contains no audio.
+ */
+async function synthesizeWithElevenLabs(text: string): Promise<TTSResult> {
+	const apiKey = env.ELEVENLABS_API_KEY;
+	if (!apiKey) {
+		throw new TTSProviderError('ELEVENLABS_API_KEY is not set.', 500);
+	}
+
+	const endpoint = `https://api.elevenlabs.io/v1/text-to-speech/${ELEVENLABS_VOICE_ID}/with-timestamps`;
+	const response = await fetch(endpoint, {
+		method: 'POST',
+		headers: {
+			'xi-api-key': apiKey,
+			'Content-Type': 'application/json',
+			Accept: 'application/json'
+		},
+		body: JSON.stringify({
+			text,
+			model_id: 'eleven_flash_v2_5',
+			voice_settings: { stability: STABILITY, similarity_boost: SIMILARITY }
+		})
+	});
+
+	if (!response.ok) {
+		const errorBody = await response.text();
+		console.error(`ElevenLabs API error: ${response.status} ${response.statusText}`, errorBody);
+		throw new TTSProviderError(`ElevenLabs request failed: ${response.statusText} - ${errorBody}`);
+	}
+
+	const data = await response.json();
+	if (!data?.audio_base64) {
+		// Mirror the 60db path: fail with a 502 instead of returning `audioBase64: undefined`.
+		console.error('[TTS:elevenlabs] No audio_base64 in response.');
+		throw new TTSProviderError('ElevenLabs returned no audio.');
+	}
+
+	const phonemes: TTSPhoneme[] = [];
+	const chars = data.alignment?.characters;
+	const startTimes = data.alignment?.character_start_times_seconds;
+	const endTimes = data.alignment?.character_end_times_seconds;
+	if (chars && startTimes && endTimes) {
+		for (let i = 0; i < chars.length; i++) {
+			phonemes.push({ character: chars[i], start: startTimes[i], end: endTimes[i] });
+		}
+	} else {
+		console.warn('[TTS:elevenlabs] Unexpected response structure (no alignment).');
+	}
+
+	console.log(`[TTS:elevenlabs] audio present: true, phonemes: ${phonemes.length}`);
+	return { audioBase64: data.audio_base64, phonemes, provider: 'elevenlabs' };
+}
+
+// --- 60db --------------------------------------------------------------------
+
+/**
+ * Synthesize `text` with 60db's `/tts-synthesize` endpoint. 60db returns no alignment,
+ * so `phonemes` is always empty and the client derives lip-sync from audio amplitude.
+ * @throws {TTSProviderError} 500 if `SIXTYDB_API_KEY` is unset; 502 if the request
+ *   fails or the response contains no audio.
+ */
+async function synthesizeWith60db(text: string): Promise<TTSResult> {
+	const apiKey = env.SIXTYDB_API_KEY;
+	if (!apiKey) {
+		throw new TTSProviderError('SIXTYDB_API_KEY is not set.', 500);
+	}
+
+	const body: Record<string, unknown> = {
+		text,
+		enhance: true,
+		speed: 1,
+		// 60db uses a 0..100 scale; map from the shared 0..1 tuning above.
+		stability: Math.round(STABILITY * 100),
+		similarity: Math.round(SIMILARITY * 100),
+		output_format: 'mp3',
+		// Optional explicit voice (UUID from GET /myvoices); otherwise the account default is used.
+		...(env.SIXTYDB_VOICE_ID ? { voice_id: env.SIXTYDB_VOICE_ID } : {})
+	};
+
+	const response = await fetch('https://api.60db.ai/tts-synthesize', {
+		method: 'POST',
+		headers: {
+			Authorization: `Bearer ${apiKey}`,
+			'Content-Type': 'application/json'
+		},
+		body: JSON.stringify(body)
+	});
+
+	if (!response.ok) {
+		const errorBody = await response.text();
+		console.error(`60db API error: ${response.status} ${response.statusText}`, errorBody);
+		throw new TTSProviderError(`60db request failed: ${response.statusText} - ${errorBody}`);
+	}
+
+	// `data?.`: a JSON `null` body must reach the TTSProviderError below, not a TypeError.
+	const data = await response.json();
+	if (!data?.audio_base64) {
+		console.error('[TTS:60db] No audio_base64 in response.', data?.message);
+		throw new TTSProviderError(`60db returned no audio: ${data?.message || 'unknown error'}`);
+	}
+
+	console.log(
+		`[TTS:60db] audio present: true, duration: ${data.duration_seconds ?? '?'}s ` +
+			`(no alignment — client derives lip-sync from amplitude)`
+	);
+	return { audioBase64: data.audio_base64, phonemes: [], provider: '60db' };
+}