lukasmwerner · lidork · Apr 15, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,30 @@
+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+.venv/
+*.egg-info/
+
+# UV lockfile (optional — commit if you want reproducible installs)
+# uv.lock
+
+# Custom voice embeddings — generated locally, not for distribution
+server/voices/*.safetensors
+server/voices/*.wav
+
+# Model weights cache (downloaded at runtime by pocket-tts)
+*.safetensors
+!server/voices.json
+
+# macOS
+.DS_Store
+
+# Editor
+.idea/
+.vscode/
+*.swp
+
+# Logs and temp output
+*.log
+output.wav
+/tmp/
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ A Chrome extension that reads web page content aloud using [Pocket TTS](https://
 
 - Read any web page content aloud
 - **Paragraph-by-paragraph processing** - audio starts playing quickly even for long documents
-- Multiple voice options (8 different voices)
+- Multiple voice options (8 built-in voices + support for custom cloned voices)
 - Automatic content extraction (focuses on main article content)
 - Simple playback controls (play/stop)
 - Works entirely locally - no cloud services required
@@ -98,6 +98,23 @@ Click "Stop" to stop playback at any time.
 - **Eponine** - Female voice
 - **Azelma** - Female voice
 
+## Adding a Custom Voice
+
+You can add your own cloned voice by placing a `.safetensors` embedding in `server/voices/` and adding an entry for it to the `voices` array in `server/voices.json`:
+
+```json
+{ "id": "my_voice", "name": "My Voice", "path": "voices/my_voice.safetensors" }
+```
+
+Restart the server — the voice will appear in the extension dropdown automatically.
+
+To generate a `.safetensors` from a WAV sample (requires access to [kyutai/pocket-tts](https://huggingface.co/kyutai/pocket-tts)):
+
+```bash
+cd server
+uv run ../create_voice_embedding.py voices/my_voice.wav voices/my_voice.safetensors
+```
+
 ## API Endpoints
 
 The TTS server provides the following endpoints:

diff --git a/extension/popup.html b/extension/popup.html
@@ -21,14 +21,7 @@ <h1>Pocket Reader</h1>
       <div class="control-group">
         <label for="voice-select">Voice</label>
         <select id="voice-select">
-          <option value="alba">Alba</option>
-          <option value="marius">Marius</option>
-          <option value="javert">Javert</option>
-          <option value="jean">Jean</option>
-          <option value="fantine">Fantine</option>
-          <option value="cosette">Cosette</option>
-          <option value="eponine">Eponine</option>
-          <option value="azelma">Azelma</option>
+          <!-- populated dynamically from /voices -->
         </select>
       </div>
 

diff --git a/extension/popup.js b/extension/popup.js
@@ -33,18 +33,15 @@ let scannedParagraphs = [];
  * Initialize popup
  */
 async function init() {
-  // Load saved voice preference
   const { voice, speed } = await chrome.storage.local.get(['voice', 'speed']);
-  if (voice) {
-    voiceSelect.value = voice;
-  }
   if (speed) {
     speedControl.value = speed;
     speedValue.textContent = `${speed}x`;
   }
 
-  // Check server status
+  // Check server status and populate voices from config
   await checkServerStatus();
+  await populateVoices(voice);
 
   // Set up event listeners
   voiceSelect.addEventListener('change', saveVoicePreference);
@@ -105,6 +102,35 @@ function setServerStatus(status, text) {
   serverStatus.querySelector('.status-text').textContent = text;
 }
 
+/**
+ * Fetch voices from the server and rebuild the dropdown, selecting the saved
+ * preference if the server still offers it and the server default otherwise.
+ */
+async function populateVoices(savedVoice) {
+  try {
+    const response = await fetch(`${SERVER_URL}/voices`, {
+      signal: AbortSignal.timeout(3000)
+    });
+    if (!response.ok) throw new Error(`GET /voices failed: HTTP ${response.status}`);
+    const data = await response.json();
+    const voices = Array.isArray(data.voices) ? data.voices : [];
+    if (voices.length === 0) return; // nothing to show; keep current options
+    const defaultVoice = data.default || voices[0].id;
+
+    voiceSelect.replaceChildren(...voices.map((v) => {
+      const opt = document.createElement('option');
+      opt.value = v.id;
+      opt.textContent = v.name || v.id;
+      return opt;
+    }));
+    const hasSaved = voices.some((v) => v.id === savedVoice);
+    voiceSelect.value = hasSaved ? savedVoice : defaultVoice;
+  } catch (err) {
+    // Server offline or bad response: dropdown is left as it was.
+    console.warn('Could not load voices:', err);
+  }
+}
+
 /**
  * Save voice preference
  */

diff --git a/server/server.py b/server/server.py
@@ -7,9 +7,12 @@
 
 import io
 import json
+import os
 import re
 import wave
 import base64
+import safetensors
+import safetensors.torch
 from flask import Flask, request, jsonify, Response
 from flask_cors import CORS
 import numpy as np
@@ -21,8 +24,19 @@
 _tts_model = None
 _voice_states = {}
 
-# Available voices (these are the predefined catalog voices)
-AVAILABLE_VOICES = ["alba", "marius", "javert", "jean", "fantine", "cosette", "eponine", "azelma"]
+_SERVER_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Load voice config from voices.json; decode as UTF-8 rather than the locale default
+with open(os.path.join(_SERVER_DIR, "voices.json"), encoding="utf-8") as _f:
+    _voices_config = json.load(_f)
+
+VOICES_LIST = _voices_config["voices"]  # [{"id": ..., "name": ..., "path": ...}, ...]
+DEFAULT_VOICE = _voices_config.get("default", "alba")
+AVAILABLE_VOICES = [v["id"] for v in VOICES_LIST]
+CUSTOM_VOICE_PATHS = {
+    v["id"]: os.path.join(_SERVER_DIR, v["path"])
+    for v in VOICES_LIST if "path" in v
+}
 
 
 SMART_QUOTE_MAP = str.maketrans({
@@ -48,14 +62,44 @@ def get_model():
     return _tts_model
 
 
+def load_custom_voice_state(safetensors_path: str, sequence_length: int = 1000) -> dict:
+    """Load a pre-computed voice state from a .safetensors file.
+
+    A fresh state is created with init_states (which also registers
+    _module_absolute_name on every StatefulModule), then each saved
+    "<module>/<tensor>" entry is copied into it. current_end is loaded as-is.
+    A saved cache may have fewer slots than the fresh one and be full, so it
+    fills only the leading slots; ValueError is raised if it has more.
+    """
+    from pocket_tts.modules.stateful_module import init_states
+    state = init_states(get_model().flow_lm, batch_size=1, sequence_length=sequence_length)
+    with safetensors.safe_open(safetensors_path, framework="pt") as f:
+        for key in f.keys():
+            module_name, sep, tensor_key = key.partition("/")
+            if not sep or module_name not in state:
+                continue  # malformed key, or a module this model does not have
+            tensor = f.get_tensor(key)
+            if tensor_key != "cache":
+                state[module_name][tensor_key] = tensor
+                continue
+            # Slot axis is dim 2: saved [2, batch, n, heads, d] into [2, batch, slots, heads, d].
+            fresh, n = state[module_name]["cache"], tensor.shape[2]
+            if n > fresh.shape[2]:
+                raise ValueError(f"{safetensors_path}: {n} cache slots exceed {fresh.shape[2]}")
+            fresh[:, :, :n, :, :] = tensor
+    return state
+
+
 def get_voice_state(voice_name: str):
     """Get or create a voice state for the given voice."""
     global _voice_states
     if voice_name not in _voice_states:
-        model = get_model()
-        # Use the voice name directly - pocket_tts handles the predefined voices
         print(f"Loading voice: {voice_name}...")
-        _voice_states[voice_name] = model.get_state_for_audio_prompt(voice_name)
+        if voice_name in CUSTOM_VOICE_PATHS:  # voices.json entry that has a "path"
+            _voice_states[voice_name] = load_custom_voice_state(CUSTOM_VOICE_PATHS[voice_name])
+        else:  # no "path": pass the id straight to pocket_tts (predefined voice)
+            model = get_model()
+            _voice_states[voice_name] = model.get_state_for_audio_prompt(voice_name)
         print(f"Voice {voice_name} loaded!")
     return _voice_states[voice_name]
 
@@ -164,8 +208,8 @@ def health():
 def list_voices():
     """List available voices."""
     return jsonify({
-        "voices": AVAILABLE_VOICES,
-        "default": "alba"
+        "voices": VOICES_LIST,
+        "default": DEFAULT_VOICE
     })
 
 
@@ -222,16 +266,16 @@ def synthesize():
         return jsonify({"error": "Missing 'text' field"}), 400
 
     text = data['text']
-    voice = data.get('voice', 'alba')
-    
+    voice = data.get('voice', DEFAULT_VOICE)
+
     if not text.strip():
         return jsonify({"error": "Text cannot be empty"}), 400
 
     text = normalize_smart_quotes(text)
-    
+
     if voice not in AVAILABLE_VOICES:
-        voice = 'alba'
-    
+        voice = DEFAULT_VOICE
+
     try:
         model = get_model()
         voice_state = get_voice_state(voice)
@@ -274,15 +318,15 @@ def synthesize_stream():
         return jsonify({"error": "Missing 'text' field"}), 400
 
     text = data['text']
-    voice = data.get('voice', 'alba')
+    voice = data.get('voice', DEFAULT_VOICE)
 
     if not text.strip():
         return jsonify({"error": "Text cannot be empty"}), 400
 
     text = normalize_smart_quotes(text)
 
     if voice not in AVAILABLE_VOICES:
-        voice = 'alba'
+        voice = DEFAULT_VOICE
 
     def generate_stream():
         try:

diff --git a/server/voices.json b/server/voices.json
@@ -0,0 +1,14 @@
+{
+  "voices": [
+    { "id": "alba",       "name": "Alba" },
+    { "id": "marius",     "name": "Marius" },
+    { "id": "javert",     "name": "Javert" },
+    { "id": "jean",       "name": "Jean" },
+    { "id": "fantine",    "name": "Fantine" },
+    { "id": "cosette",    "name": "Cosette" },
+    { "id": "eponine",    "name": "Eponine" },
+    { "id": "azelma",     "name": "Azelma" },
+    { "id": "example_custom", "name": "Example Custom", "path": "voices/example_custom.safetensors" }
+  ],
+  "default": "alba"
+}