diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..03e878c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,30 @@
+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+.venv/
+*.egg-info/
+
+# UV lockfile — tracked by default for reproducible installs; uncomment to ignore
+# uv.lock
+
+# Custom voice embeddings — generated locally, not for distribution
+server/voices/*.safetensors
+server/voices/*.wav
+
+# Model weights cache (downloaded at runtime by pocket-tts).
+# server/voices.json matches no pattern in this file, so it stays tracked.
+*.safetensors
+
+# macOS
+.DS_Store
+
+# Editor
+.idea/
+.vscode/
+*.swp
+
+# Logs and temp output
+*.log
+output.wav
+/tmp/
diff --git a/README.md b/README.md
index c748595..7d00159 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ A Chrome extension that reads web page content aloud using [Pocket TTS](https://
- Read any web page content aloud
- **Paragraph-by-paragraph processing** - audio starts playing quickly even for long documents
-- Multiple voice options (8 different voices)
+- Multiple voice options (8 built-in voices + support for custom cloned voices)
- Automatic content extraction (focuses on main article content)
- Simple playback controls (play/stop)
- Works entirely locally - no cloud services required
@@ -98,6 +98,23 @@ Click "Stop" to stop playback at any time.
- **Eponine** - Female voice
- **Azelma** - Female voice
+## Adding a Custom Voice
+
+You can add your own cloned voice by placing a `.safetensors` embedding in `server/voices/` and adding an entry for it to the `voices` array in `server/voices.json` (`path` is resolved relative to the `server/` directory):
+
+```json
+{ "id": "my_voice", "name": "My Voice", "path": "voices/my_voice.safetensors" }
+```
+
+Restart the server — the voice will appear in the extension dropdown automatically.
+
+To generate a `.safetensors` from a WAV sample (requires access to [kyutai/pocket-tts](https://huggingface.co/kyutai/pocket-tts)):
+
+```bash
+cd server
+uv run ../create_voice_embedding.py voices/my_voice.wav voices/my_voice.safetensors
+```
+
## API Endpoints
The TTS server provides the following endpoints:
diff --git a/extension/popup.html b/extension/popup.html
index a45ce0f..351af39 100644
--- a/extension/popup.html
+++ b/extension/popup.html
@@ -21,14 +21,7 @@
Pocket Reader
diff --git a/extension/popup.js b/extension/popup.js
index c967bab..defecc5 100644
--- a/extension/popup.js
+++ b/extension/popup.js
@@ -33,18 +33,15 @@ let scannedParagraphs = [];
* Initialize popup
*/
async function init() {
- // Load saved voice preference
const { voice, speed } = await chrome.storage.local.get(['voice', 'speed']);
- if (voice) {
- voiceSelect.value = voice;
- }
if (speed) {
speedControl.value = speed;
speedValue.textContent = `${speed}x`;
}
- // Check server status
+ // Check server status and populate voices from config
await checkServerStatus();
+ await populateVoices(voice);
// Set up event listeners
voiceSelect.addEventListener('change', saveVoicePreference);
@@ -105,6 +102,35 @@ function setServerStatus(status, text) {
serverStatus.querySelector('.status-text').textContent = text;
}
+/**
+ * Fetch voices from server and populate the dropdown.
+ * Falls back to the saved preference once options are built.
+ */
+async function populateVoices(savedVoice) {
+ try {
+ const response = await fetch(`${SERVER_URL}/voices`, {
+ signal: AbortSignal.timeout(3000)
+ });
+ const data = await response.json();
+ const voices = data.voices || [];
+ const defaultVoice = data.default || (voices[0] && voices[0].id);
+
+ voiceSelect.innerHTML = '';
+ for (const v of voices) {
+ const opt = document.createElement('option');
+ opt.value = v.id;
+ opt.textContent = v.name;
+ voiceSelect.appendChild(opt);
+ }
+
+ voiceSelect.value = savedVoice && voices.some(v => v.id === savedVoice)
+ ? savedVoice
+ : defaultVoice;
+ } catch {
+ // Server offline — leave dropdown empty, user will see disconnected status
+ }
+}
+
/**
* Save voice preference
*/
diff --git a/server/server.py b/server/server.py
index a1a4e4a..4aa834e 100644
--- a/server/server.py
+++ b/server/server.py
@@ -7,9 +7,12 @@
import io
import json
+import os
import re
import wave
import base64
+import safetensors
+import safetensors.torch
from flask import Flask, request, jsonify, Response
from flask_cors import CORS
import numpy as np
@@ -21,8 +24,19 @@
_tts_model = None
_voice_states = {}
-# Available voices (these are the predefined catalog voices)
-AVAILABLE_VOICES = ["alba", "marius", "javert", "jean", "fantine", "cosette", "eponine", "azelma"]
+_SERVER_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Load voice config from voices.json
+with open(os.path.join(_SERVER_DIR, "voices.json")) as _f:
+ _voices_config = json.load(_f)
+
+VOICES_LIST = _voices_config["voices"] # [{"id": ..., "name": ..., "path": ...}, ...]
+DEFAULT_VOICE = _voices_config.get("default", "alba")
+AVAILABLE_VOICES = [v["id"] for v in VOICES_LIST]
+CUSTOM_VOICE_PATHS = {
+ v["id"]: os.path.join(_SERVER_DIR, v["path"])
+ for v in VOICES_LIST if "path" in v
+}
SMART_QUOTE_MAP = str.maketrans({
@@ -48,14 +62,44 @@ def get_model():
return _tts_model
+def load_custom_voice_state(safetensors_path: str) -> dict:
+ """Load a pre-computed voice state from a .safetensors file.
+
+ 1. Calls init_states to register _module_absolute_name on every StatefulModule.
+ 2. Loads current_end as-is (tracks how many audio frames were processed).
+ 3. Copies saved cache into the first N slots of the fresh 1000-slot cache —
+ the saved cache may be smaller (e.g. 100 slots) and full, so we cannot
+ assign it directly; we need room for generation.
+ """
+ from pocket_tts.modules.stateful_module import init_states
+ model = get_model()
+ state = init_states(model.flow_lm, batch_size=1, sequence_length=1000)
+ with safetensors.safe_open(safetensors_path, framework="pt") as f:
+ for key in f.keys():
+ module_name, tensor_key = key.split("/", 1)
+ if module_name not in state:
+ continue
+ tensor = f.get_tensor(key)
+ if tensor_key == "cache":
+ # saved cache shape: [2, batch, n_slots, heads, d]
+ # fresh cache shape: [2, batch, 1000, heads, d]
+ n = tensor.shape[2]
+ state[module_name]["cache"][:, :, :n, :, :] = tensor
+ else:
+ state[module_name][tensor_key] = tensor
+ return state
+
+
def get_voice_state(voice_name: str):
"""Get or create a voice state for the given voice."""
global _voice_states
if voice_name not in _voice_states:
- model = get_model()
- # Use the voice name directly - pocket_tts handles the predefined voices
print(f"Loading voice: {voice_name}...")
+ # Custom voices (voices.json entries with a "path") are restored from their .safetensors file
+ if voice_name in CUSTOM_VOICE_PATHS:
+ _voice_states[voice_name] = load_custom_voice_state(CUSTOM_VOICE_PATHS[voice_name])
+ else:
+ # Any other voice name is passed straight to pocket_tts to resolve
+ model = get_model()
+ _voice_states[voice_name] = model.get_state_for_audio_prompt(voice_name)
print(f"Voice {voice_name} loaded!")
return _voice_states[voice_name]
@@ -164,8 +208,8 @@ def health():
def list_voices():
"""List available voices."""
return jsonify({
- "voices": AVAILABLE_VOICES,
- "default": "alba"
+ "voices": VOICES_LIST,
+ "default": DEFAULT_VOICE
})
@@ -222,16 +266,16 @@ def synthesize():
return jsonify({"error": "Missing 'text' field"}), 400
text = data['text']
- voice = data.get('voice', 'alba')
-
+ voice = data.get('voice', DEFAULT_VOICE)
+
if not text.strip():
return jsonify({"error": "Text cannot be empty"}), 400
text = normalize_smart_quotes(text)
-
+
if voice not in AVAILABLE_VOICES:
- voice = 'alba'
-
+ voice = DEFAULT_VOICE
+
try:
model = get_model()
voice_state = get_voice_state(voice)
@@ -274,7 +318,7 @@ def synthesize_stream():
return jsonify({"error": "Missing 'text' field"}), 400
text = data['text']
- voice = data.get('voice', 'alba')
+ voice = data.get('voice', DEFAULT_VOICE)
if not text.strip():
return jsonify({"error": "Text cannot be empty"}), 400
@@ -282,7 +326,7 @@ def synthesize_stream():
text = normalize_smart_quotes(text)
if voice not in AVAILABLE_VOICES:
- voice = 'alba'
+ voice = DEFAULT_VOICE
def generate_stream():
try:
diff --git a/server/voices.json b/server/voices.json
new file mode 100644
index 0000000..dcd33b7
--- /dev/null
+++ b/server/voices.json
@@ -0,0 +1,14 @@
+{
+ "voices": [
+ { "id": "alba", "name": "Alba" },
+ { "id": "marius", "name": "Marius" },
+ { "id": "javert", "name": "Javert" },
+ { "id": "jean", "name": "Jean" },
+ { "id": "fantine", "name": "Fantine" },
+ { "id": "cosette", "name": "Cosette" },
+ { "id": "eponine", "name": "Eponine" },
+ { "id": "azelma", "name": "Azelma" },
+ { "id": "example_custom", "name": "Example Custom", "path": "voices/example_custom.safetensors" }
+ ],
+ "default": "alba"
+}