diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..03e878c --- /dev/null +++ b/.gitignore @@ -0,0 +1,30 @@ +# Python +__pycache__/ +*.py[cod] +*.pyo +.venv/ +*.egg-info/ + +# UV lockfile (optional — commit if you want reproducible installs) +# uv.lock + +# Custom voice embeddings — generated locally, not for distribution +server/voices/*.safetensors +server/voices/*.wav + +# Model weights cache (downloaded at runtime by pocket-tts) +*.safetensors +!server/voices.json + +# macOS +.DS_Store + +# Editor +.idea/ +.vscode/ +*.swp + +# Logs and temp output +*.log +output.wav +/tmp/ diff --git a/README.md b/README.md index c748595..7d00159 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ A Chrome extension that reads web page content aloud using [Pocket TTS](https:// - Read any web page content aloud - **Paragraph-by-paragraph processing** - audio starts playing quickly even for long documents -- Multiple voice options (8 different voices) +- Multiple voice options (8 built-in voices + support for custom cloned voices) - Automatic content extraction (focuses on main article content) - Simple playback controls (play/stop) - Works entirely locally - no cloud services required @@ -98,6 +98,23 @@ Click "Stop" to stop playback at any time. - **Eponine** - Female voice - **Azelma** - Female voice +## Adding a Custom Voice + +You can add your own cloned voice by placing a `.safetensors` embedding in `server/voices/` and registering it in `server/voices.json`: + +```json +{ "id": "my_voice", "name": "My Voice", "path": "voices/my_voice.safetensors" } +``` + +Restart the server — the voice will appear in the extension dropdown automatically. 
+ +To generate a `.safetensors` from a WAV sample (requires access to [kyutai/pocket-tts](https://huggingface.co/kyutai/pocket-tts)): + +```bash +cd server +uv run ../create_voice_embedding.py voices/my_voice.wav voices/my_voice.safetensors +``` + ## API Endpoints The TTS server provides the following endpoints: diff --git a/extension/popup.html b/extension/popup.html index a45ce0f..351af39 100644 --- a/extension/popup.html +++ b/extension/popup.html @@ -21,14 +21,7 @@

Pocket Reader

diff --git a/extension/popup.js b/extension/popup.js index c967bab..defecc5 100644 --- a/extension/popup.js +++ b/extension/popup.js @@ -33,18 +33,15 @@ let scannedParagraphs = []; * Initialize popup */ async function init() { - // Load saved voice preference const { voice, speed } = await chrome.storage.local.get(['voice', 'speed']); - if (voice) { - voiceSelect.value = voice; - } if (speed) { speedControl.value = speed; speedValue.textContent = `${speed}x`; } - // Check server status + // Check server status and populate voices from config await checkServerStatus(); + await populateVoices(voice); // Set up event listeners voiceSelect.addEventListener('change', saveVoicePreference); @@ -105,6 +102,35 @@ function setServerStatus(status, text) { serverStatus.querySelector('.status-text').textContent = text; } +/** + * Fetch voices from server and populate the dropdown. + * Falls back to the saved preference once options are built. + */ +async function populateVoices(savedVoice) { + try { + const response = await fetch(`${SERVER_URL}/voices`, { + signal: AbortSignal.timeout(3000) + }); + const data = await response.json(); + const voices = data.voices || []; + const defaultVoice = data.default || (voices[0] && voices[0].id); + + voiceSelect.innerHTML = ''; + for (const v of voices) { + const opt = document.createElement('option'); + opt.value = v.id; + opt.textContent = v.name; + voiceSelect.appendChild(opt); + } + + voiceSelect.value = savedVoice && voices.some(v => v.id === savedVoice) + ? 
def load_custom_voice_state(
    safetensors_path: str, *, sequence_length: int = 1000
) -> dict:
    """Load a pre-computed voice state from a ``.safetensors`` file.

    A fresh state is built with ``init_states`` and then filled from the
    file. Keys in the file are expected to have the form
    ``"<module_name>/<tensor_key>"``:

    * ``cache`` tensors are copied into the first ``n`` slots of the fresh
      cache. The saved cache may be smaller than the fresh one (e.g. 100
      slots) and already full, so it cannot simply replace the fresh cache;
      the remaining slots are needed for generation.
    * Every other tensor (e.g. ``current_end``) is stored as-is.

    Keys without a ``/`` separator, and keys whose module is not present in
    the fresh state, are skipped.

    Args:
        safetensors_path: Path to the saved voice-state file.
        sequence_length: Number of slots requested for the fresh state
            (passed straight to ``init_states``).

    Returns:
        The state mapping returned by ``init_states``, populated from the
        file.

    Raises:
        ValueError: If a saved cache holds more slots than the fresh cache,
            which would otherwise surface as an opaque shape-mismatch error
            during the slice assignment.
    """
    # Imported lazily, like the original, so the module can be imported
    # before pocket_tts is needed.
    from pocket_tts.modules.stateful_module import init_states

    model = get_model()
    state = init_states(
        model.flow_lm, batch_size=1, sequence_length=sequence_length
    )
    with safetensors.safe_open(safetensors_path, framework="pt") as f:
        for key in f.keys():
            # partition() never raises on a key that lacks "/", unlike
            # unpacking the result of split("/", 1).
            module_name, separator, tensor_key = key.partition("/")
            if not separator or module_name not in state:
                continue
            tensor = f.get_tensor(key)
            if tensor_key != "cache":
                state[module_name][tensor_key] = tensor
                continue
            # saved cache shape: [2, batch, n_slots, heads, d]
            # fresh cache shape: [2, batch, sequence_length, heads, d]
            fresh_cache = state[module_name]["cache"]
            n_saved = tensor.shape[2]
            n_fresh = fresh_cache.shape[2]
            if n_saved > n_fresh:
                raise ValueError(
                    f"Saved cache for {key!r} in {safetensors_path!r} has "
                    f"{n_saved} slots but the fresh cache only has {n_fresh}"
                )
            fresh_cache[:, :, :n_saved, :, :] = tensor
    return state


def get_voice_state(voice_name: str):
    """Return the voice state for ``voice_name``, loading it on first use.

    Loaded states are memoised in the module-level ``_voice_states`` dict.
    Voices listed in ``CUSTOM_VOICE_PATHS`` are loaded from their
    ``.safetensors`` file; any other name is handed to the model's
    ``get_state_for_audio_prompt``.

    Args:
        voice_name: Identifier of the voice to load.

    Returns:
        The cached or freshly loaded voice state.
    """
    # The dict is only mutated, never rebound, so no ``global`` is needed.
    if voice_name in _voice_states:
        return _voice_states[voice_name]

    print(f"Loading voice: {voice_name}...")
    custom_path = CUSTOM_VOICE_PATHS.get(voice_name)
    if custom_path is not None:
        voice_state = load_custom_voice_state(custom_path)
    else:
        voice_state = get_model().get_state_for_audio_prompt(voice_name)
    _voice_states[voice_name] = voice_state
    print(f"Voice {voice_name} loaded!")
    return voice_state
synthesize_stream(): return jsonify({"error": "Missing 'text' field"}), 400 text = data['text'] - voice = data.get('voice', 'alba') + voice = data.get('voice', DEFAULT_VOICE) if not text.strip(): return jsonify({"error": "Text cannot be empty"}), 400 @@ -282,7 +326,7 @@ def synthesize_stream(): text = normalize_smart_quotes(text) if voice not in AVAILABLE_VOICES: - voice = 'alba' + voice = DEFAULT_VOICE def generate_stream(): try: diff --git a/server/voices.json b/server/voices.json new file mode 100644 index 0000000..dcd33b7 --- /dev/null +++ b/server/voices.json @@ -0,0 +1,14 @@ +{ + "voices": [ + { "id": "alba", "name": "Alba" }, + { "id": "marius", "name": "Marius" }, + { "id": "javert", "name": "Javert" }, + { "id": "jean", "name": "Jean" }, + { "id": "fantine", "name": "Fantine" }, + { "id": "cosette", "name": "Cosette" }, + { "id": "eponine", "name": "Eponine" }, + { "id": "azelma", "name": "Azelma" }, + { "id": "example_custom", "name": "Example Custom", "path": "voices/example_custom.safetensors" } + ], + "default": "alba" +}