Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Python
__pycache__/
*.py[cod]
*.pyo
.venv/
*.egg-info/

# UV lockfile (optional — commit if you want reproducible installs)
# uv.lock

# Custom voice embeddings — generated locally, not for distribution
server/voices/*.safetensors
server/voices/*.wav

# Model weights cache (downloaded at runtime by pocket-tts)
# NOTE: this repo-wide pattern also matches the server/voices/*.safetensors
# files already listed in the custom-voice section above.
*.safetensors
# NOTE(review): none of the ignore patterns visible in this file match
# server/voices.json, so this negation looks like a no-op — presumably kept
# as a guard so the voice registry always stays tracked; confirm or remove.
!server/voices.json

# macOS
.DS_Store

# Editor
.idea/
.vscode/
*.swp

# Logs and temp output
*.log
output.wav
/tmp/
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ A Chrome extension that reads web page content aloud using [Pocket TTS](https://

- Read any web page content aloud
- **Paragraph-by-paragraph processing** - audio starts playing quickly even for long documents
- Multiple voice options (8 different voices)
- Multiple voice options (8 built-in voices + support for custom cloned voices)
- Automatic content extraction (focuses on main article content)
- Simple playback controls (play/stop)
- Works entirely locally - no cloud services required
Expand Down Expand Up @@ -98,6 +98,23 @@ Click "Stop" to stop playback at any time.
- **Eponine** - Female voice
- **Azelma** - Female voice

## Adding a Custom Voice

You can add your own cloned voice by placing a `.safetensors` embedding in `server/voices/` and adding an entry for it to the `voices` array in `server/voices.json` (the `path` is relative to the `server/` directory):

```json
{ "id": "my_voice", "name": "My Voice", "path": "voices/my_voice.safetensors" }
```

Restart the server — the voice will appear in the extension dropdown automatically.

To generate a `.safetensors` from a WAV sample (requires access to [kyutai/pocket-tts](https://huggingface.co/kyutai/pocket-tts)):

```bash
cd server
uv run ../create_voice_embedding.py voices/my_voice.wav voices/my_voice.safetensors
```

## API Endpoints

The TTS server provides the following endpoints:
Expand Down
9 changes: 1 addition & 8 deletions extension/popup.html
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,7 @@ <h1>Pocket Reader</h1>
<div class="control-group">
<label for="voice-select">Voice</label>
<select id="voice-select">
<option value="alba">Alba</option>
<option value="marius">Marius</option>
<option value="javert">Javert</option>
<option value="jean">Jean</option>
<option value="fantine">Fantine</option>
<option value="cosette">Cosette</option>
<option value="eponine">Eponine</option>
<option value="azelma">Azelma</option>
<!-- populated dynamically from /voices -->
</select>
</div>

Expand Down
36 changes: 31 additions & 5 deletions extension/popup.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,15 @@ let scannedParagraphs = [];
* Initialize popup
*/
async function init() {
// Load saved voice preference
const { voice, speed } = await chrome.storage.local.get(['voice', 'speed']);
if (voice) {
voiceSelect.value = voice;
}
if (speed) {
speedControl.value = speed;
speedValue.textContent = `${speed}x`;
}

// Check server status
// Check server status and populate voices from config
await checkServerStatus();
await populateVoices(voice);

// Set up event listeners
voiceSelect.addEventListener('change', saveVoicePreference);
Expand Down Expand Up @@ -105,6 +102,35 @@ function setServerStatus(status, text) {
serverStatus.querySelector('.status-text').textContent = text;
}

/**
 * Fetch the voice list from the server and rebuild the voice dropdown.
 *
 * Selection priority once the options are built:
 *   1. the saved preference, if the server still offers that voice;
 *   2. the server-declared default, if it is in the list;
 *   3. the first voice in the list.
 *
 * Entries may be objects ({ id, name }) or plain id strings; strings are
 * accepted so the popup keeps working against a server that returns a bare
 * list of voice ids.
 *
 * On any failure (server offline, timeout, non-2xx response, invalid JSON)
 * the dropdown is left untouched and a warning is logged; the server status
 * indicator is what tells the user the server is unreachable.
 *
 * @param {string|undefined} savedVoice - Voice id previously stored by the user.
 * @returns {Promise<void>}
 */
async function populateVoices(savedVoice) {
  try {
    const response = await fetch(`${SERVER_URL}/voices`, {
      signal: AbortSignal.timeout(3000)
    });
    // fetch() only rejects on network errors; an HTTP error still resolves,
    // and its body would otherwise be mistaken for an empty voice list.
    if (!response.ok) {
      throw new Error(`GET /voices failed with HTTP ${response.status}`);
    }

    const data = await response.json();
    const rawVoices = Array.isArray(data.voices) ? data.voices : [];
    const voices = rawVoices
      .map(v => (typeof v === 'string' ? { id: v, name: v } : v))
      .filter(v => v && v.id);

    voiceSelect.innerHTML = '';
    for (const v of voices) {
      const opt = document.createElement('option');
      opt.value = v.id;
      opt.textContent = v.name || v.id;
      voiceSelect.appendChild(opt);
    }

    if (voices.length === 0) {
      return;
    }

    // Assigning a value that matches no <option> would leave nothing
    // selected, so every candidate is checked against the list first.
    const isOffered = id => Boolean(id) && voices.some(v => v.id === id);
    if (isOffered(savedVoice)) {
      voiceSelect.value = savedVoice;
    } else if (isOffered(data.default)) {
      voiceSelect.value = data.default;
    } else {
      voiceSelect.value = voices[0].id;
    }
  } catch (err) {
    // Best effort: keep the popup usable, but leave a trace for debugging.
    console.warn('Could not load voices from server:', err);
  }
}

/**
* Save voice preference
*/
Expand Down
72 changes: 58 additions & 14 deletions server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@

import io
import json
import os
import re
import wave
import base64
import safetensors
import safetensors.torch
from flask import Flask, request, jsonify, Response
from flask_cors import CORS
import numpy as np
Expand All @@ -21,8 +24,19 @@
_tts_model = None
_voice_states = {}

_SERVER_DIR = os.path.dirname(os.path.abspath(__file__))

# Load the voice catalog from voices.json, which lives next to this file.
# The encoding is explicit so display names do not depend on the platform's
# default locale encoding.
with open(os.path.join(_SERVER_DIR, "voices.json"), encoding="utf-8") as _f:
    _voices_config = json.load(_f)

# VOICES_LIST:        [{"id": ..., "name": ..., "path": ...}, ...] as served by /voices
# CUSTOM_VOICE_PATHS: voice id -> absolute path of its .safetensors embedding
#
# An entry with a "path" is a custom voice whose embedding is generated
# locally. If that file is missing the voice is skipped, so it is neither
# advertised to clients nor accepted by the synthesis endpoints; otherwise
# selecting it would only fail later, at load time.
VOICES_LIST = []
CUSTOM_VOICE_PATHS = {}
for _voice in _voices_config["voices"]:
    if "path" in _voice:
        _voice_file = os.path.join(_SERVER_DIR, _voice["path"])
        if not os.path.isfile(_voice_file):
            print(f"Skipping custom voice {_voice['id']!r}: {_voice_file} not found")
            continue
        CUSTOM_VOICE_PATHS[_voice["id"]] = _voice_file
    VOICES_LIST.append(_voice)

DEFAULT_VOICE = _voices_config.get("default", "alba")
AVAILABLE_VOICES = [v["id"] for v in VOICES_LIST]


SMART_QUOTE_MAP = str.maketrans({
Expand All @@ -48,14 +62,44 @@ def get_model():
return _tts_model


def load_custom_voice_state(safetensors_path: str) -> dict:
    """Load a pre-computed voice state from a .safetensors file.

    Steps:

    1. Build a fresh state with ``init_states`` (sequence length 1000) for
       the model's ``flow_lm``. Per the original author's note this also
       registers ``_module_absolute_name`` on every StatefulModule.
    2. For each ``"<module>/<tensor>"`` key in the file whose module exists
       in the fresh state, load the tensor.
    3. ``cache`` tensors are copied into the first ``n`` slots of the fresh
       cache rather than assigned. The saved cache may be smaller than the
       fresh one and already full, so direct assignment would leave no room
       for generation. All other tensors (e.g. ``current_end``) replace the
       fresh value as-is.

    Args:
        safetensors_path: Path to the saved voice-state file.

    Returns:
        The state dict, keyed by module name, ready to use as a voice state.

    Raises:
        ValueError: If a saved cache has more slots than the fresh cache.
    """
    from pocket_tts.modules.stateful_module import init_states

    model = get_model()
    state = init_states(model.flow_lm, batch_size=1, sequence_length=1000)
    with safetensors.safe_open(safetensors_path, framework="pt") as f:
        for key in f.keys():
            # partition() never raises: a key without "/" (not part of a
            # module state) is skipped instead of breaking the unpacking.
            module_name, separator, tensor_key = key.partition("/")
            if not separator or module_name not in state:
                continue
            tensor = f.get_tensor(key)
            if tensor_key == "cache":
                # Slots are indexed on dim 2 (per the original notes:
                # saved [2, batch, n_slots, heads, d] into
                # fresh [2, batch, 1000, heads, d]).
                fresh_cache = state[module_name]["cache"]
                n = tensor.shape[2]
                capacity = fresh_cache.shape[2]
                if n > capacity:
                    raise ValueError(
                        f"Saved cache for {module_name!r} has {n} slots, "
                        f"but the fresh cache only holds {capacity}"
                    )
                fresh_cache[:, :, :n, :, :] = tensor
            else:
                state[module_name][tensor_key] = tensor
    return state


def get_voice_state(voice_name: str):
    """Get or create a voice state for the given voice.

    States are cached in ``_voice_states``, so each voice is loaded once.
    Voices listed in ``CUSTOM_VOICE_PATHS`` are loaded from their
    .safetensors file. Any other name is passed to the model's
    ``get_state_for_audio_prompt``.

    Args:
        voice_name: Voice id to look up.

    Returns:
        The cached voice state for ``voice_name``.
    """
    # No ``global`` statement needed: the dict is mutated, never rebound.
    if voice_name not in _voice_states:
        print(f"Loading voice: {voice_name}...")
        # Choose exactly one loader. Running the built-in loader first would
        # hand custom ids to get_state_for_audio_prompt and would load
        # built-in voices twice.
        if voice_name in CUSTOM_VOICE_PATHS:
            _voice_states[voice_name] = load_custom_voice_state(
                CUSTOM_VOICE_PATHS[voice_name]
            )
        else:
            model = get_model()
            _voice_states[voice_name] = model.get_state_for_audio_prompt(voice_name)
        print(f"Voice {voice_name} loaded!")
    return _voice_states[voice_name]

Expand Down Expand Up @@ -164,8 +208,8 @@ def health():
def list_voices():
    """List available voices.

    Returns a JSON object with:
      - "voices": the voice entries from ``VOICES_LIST``
      - "default": the id in ``DEFAULT_VOICE``
    """
    # Each key appears once; duplicate keys in a dict literal are silently
    # resolved in favour of the last one, which hides the stale values.
    return jsonify({
        "voices": VOICES_LIST,
        "default": DEFAULT_VOICE,
    })


Expand Down Expand Up @@ -222,16 +266,16 @@ def synthesize():
return jsonify({"error": "Missing 'text' field"}), 400

text = data['text']
voice = data.get('voice', 'alba')
voice = data.get('voice', DEFAULT_VOICE)

if not text.strip():
return jsonify({"error": "Text cannot be empty"}), 400

text = normalize_smart_quotes(text)

if voice not in AVAILABLE_VOICES:
voice = 'alba'
voice = DEFAULT_VOICE

try:
model = get_model()
voice_state = get_voice_state(voice)
Expand Down Expand Up @@ -274,15 +318,15 @@ def synthesize_stream():
return jsonify({"error": "Missing 'text' field"}), 400

text = data['text']
voice = data.get('voice', 'alba')
voice = data.get('voice', DEFAULT_VOICE)

if not text.strip():
return jsonify({"error": "Text cannot be empty"}), 400

text = normalize_smart_quotes(text)

if voice not in AVAILABLE_VOICES:
voice = 'alba'
voice = DEFAULT_VOICE

def generate_stream():
try:
Expand Down
14 changes: 14 additions & 0 deletions server/voices.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"voices": [
{ "id": "alba", "name": "Alba" },
{ "id": "marius", "name": "Marius" },
{ "id": "javert", "name": "Javert" },
{ "id": "jean", "name": "Jean" },
{ "id": "fantine", "name": "Fantine" },
{ "id": "cosette", "name": "Cosette" },
{ "id": "eponine", "name": "Eponine" },
{ "id": "azelma", "name": "Azelma" },
{ "id": "example_custom", "name": "Example Custom", "path": "voices/example_custom.safetensors" }
],
"default": "alba"
}