dictate/dictate.py at main · thomaspeklak/dictate · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "faster-whisper>=1.0.0",
#     "pyaudio>=0.2.13",
#     "numpy>=1.24.0",
# ]
# ///
"""Dictate - Speech-to-text with agent processing.

Toggle recording with keyboard shortcut (via GNOME keybinding).
First invocation starts recording, second stops and processes.
"""

import argparse
import os
import signal
import sys
import time

# Path of the file holding the PID of the instance that is currently recording.
# The recording instance writes its own PID here; a later invocation reads it
# to find the process to signal.
PID_FILE = "/tmp/dictate.pid"


def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse the process arguments.

    Returns:
        Namespace with ``language`` (string or None), ``raw`` (bool) and
        ``type`` (bool).
    """
    cli = argparse.ArgumentParser(description="Speech-to-text with agent processing")

    # The only option that takes a value.
    cli.add_argument(
        "-l",
        "--language",
        default=None,
        help="Language code (e.g., 'de' for German). Default: en",
    )

    # Boolean switches all share one shape: False unless the flag is given.
    switches = (
        ("-r", "--raw", "Skip Claude processing, output raw transcription"),
        ("-t", "--type", "Type text at cursor (in addition to clipboard)"),
    )
    for short_flag, long_flag, description in switches:
        cli.add_argument(short_flag, long_flag, action="store_true", help=description)

    return cli.parse_args()


def is_running() -> int | None:
    """Check if another instance is recording.

    Reads the PID stored in ``PID_FILE`` and probes that process with
    signal 0, which performs the existence/permission checks without
    delivering a signal.

    Returns:
        PID of running instance (always a positive integer), or None if not
        running. A PID file that is unreadable, malformed, holds a
        non-positive PID, or names a process that cannot be signalled is
        treated as stale and removed on a best-effort basis.
    """
    try:
        with open(PID_FILE) as f:
            pid = int(f.read().strip())
    except FileNotFoundError:
        # No PID file: nothing is recording and there is nothing to clean up.
        return None
    except (OSError, ValueError):
        # Unreadable file or contents that are not an integer.
        pid = None

    # Only a strictly positive PID names a single process. For kill(), 0 and
    # negative values address process groups (or, for -1, every process the
    # caller may signal), so a corrupted or planted file must never let such
    # a value reach the caller, which sends SIGUSR1 to whatever we return.
    if pid is not None and pid > 0:
        try:
            os.kill(pid, 0)
        except (OSError, OverflowError):
            # No such process, not permitted to signal it (so it is not our
            # own instance), or the number does not fit a pid_t: stale file.
            pass
        else:
            return pid

    # Stale or invalid PID file - clean it up, ignoring failures.
    try:
        os.unlink(PID_FILE)
    except OSError:
        pass
    return None


def cleanup_pid_file() -> None:
    """Delete the PID file; any OS-level failure (e.g. already gone) is ignored."""
    try:
        os.remove(PID_FILE)
    except OSError:
        # Best-effort cleanup: nothing useful to do if removal fails.
        return


def main() -> int:
    """Main entry point.

    Acts as a toggle. If another instance is already recording, send it
    SIGUSR1 and exit. Otherwise become the recording instance: publish our
    PID, record until SIGUSR1 arrives, transcribe, optionally post-process
    the text with the agent, and deliver the result to the clipboard (and
    type it at the cursor when ``--type`` is given).

    Returns:
        Exit status: 0 on success or after signalling the running instance,
        1 when the recording is too short, no speech is detected, or any
        step raises.
    """
    args = parse_args()

    # Check if another instance is running
    existing_pid = is_running()
    # Truthiness (not ``is not None``) on purpose: a PID of 0 must never be
    # signalled, since kill(0, ...) targets our own process group.
    if existing_pid:
        # Signal existing process to stop recording
        try:
            os.kill(existing_pid, signal.SIGUSR1)
        except OSError:
            # Best effort: the other instance may have exited in the meantime.
            pass
        return 0

    # Import here to avoid slow startup when just signaling
    from dictate.agent import Agent
    from dictate.clipboard import copy_to_clipboard, type_text
    from dictate.config import Config
    from dictate.notifier import Notifier
    from dictate.recorder import Recorder
    from dictate.transcriber import Transcriber

    # Track if we should stop
    should_stop = False

    def handle_stop_signal(signum, frame):
        nonlocal should_stop
        should_stop = True

    # Install the handler BEFORE publishing our PID. Once the PID file exists
    # a second invocation may send SIGUSR1 at any moment, and while the
    # default disposition is still in place that signal terminates the
    # process instead of stopping the recording.
    signal.signal(signal.SIGUSR1, handle_stop_signal)

    # Write PID file
    with open(PID_FILE, "w") as f:
        f.write(str(os.getpid()))

    recorder = None
    try:
        # Load configuration
        config = Config.load()

        # Reset notification ID for fresh session
        Notifier.reset()

        # Start recording
        Notifier.recording()
        recorder = Recorder()
        recorder.start(device_index=config.audio_device)

        # Wait for stop signal (polled; the handler only flips the flag)
        while not should_stop:
            time.sleep(0.1)

        # Stop recording and get audio
        audio = recorder.stop()

        # Check if recording is too short
        # NOTE(review): the threshold assumes the recorder captures at 16 kHz
        # (16000 samples = 1 second) - confirm against Recorder.
        if len(audio) < 16000:
            Notifier.error("Recording too short")
            return 1

        # Transcribe
        Notifier.transcribing()
        transcriber = Transcriber(
            model_size=config.model_size,
            device=config.device,
            compute_type=config.compute_type,
        )
        # A language given on the command line overrides the configured one.
        language = args.language or config.language
        text = transcriber.transcribe(audio, language=language)

        if not text.strip():
            Notifier.error("No speech detected")
            return 1

        # Process with Claude (unless --raw)
        if args.raw:
            processed = text
        else:
            Notifier.processing()
            agent = Agent(prompt_template=config.prompt_template)
            try:
                processed = agent.process(text)
            except Exception as e:
                # Fallback to raw transcription if Claude fails
                processed = text
                Notifier.notify("Warning", f"Claude failed, using raw text: {e}", "dialog-warning")

        # Copy to clipboard
        copy_to_clipboard(processed)
        if args.type:
            type_text(processed)
        Notifier.done(processed)

        return 0

    except Exception as e:
        # Top-level boundary: report the failure to the user and exit non-zero.
        Notifier.error(str(e))
        return 1

    finally:
        # Runs on every exit path, including the early ``return 1`` branches.
        if recorder:
            recorder.terminate()
        cleanup_pid_file()


if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    raise SystemExit(main())