diff --git a/CHANGELOG.md b/CHANGELOG.md index 21b31bb0a..1445423a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Fixes +- The MCP server no longer hangs on startup when the parser fails to load a language grammar. Previously a grammar-load crash could wedge the shared background daemon, so every editor session that connected to it hung waiting to initialize — on any project, large or small. The parser now falls back to in-process parsing instead of hanging, and a stuck startup reconcile can no longer block the first query indefinitely (tunable via `CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS`). - Indexing a project that contains only config-style files (YAML, Twig, or `.properties`) no longer misleadingly reports "No files found to index" — these files are tracked at the file level and are now counted as indexed. Thanks @luojiyin1987 (#357). ## [0.9.7] - 2026-05-28 diff --git a/__tests__/mcp-catchup-gate.test.ts b/__tests__/mcp-catchup-gate.test.ts index 6baee07c4..ae0152606 100644 --- a/__tests__/mcp-catchup-gate.test.ts +++ b/__tests__/mcp-catchup-gate.test.ts @@ -119,4 +119,21 @@ describe('MCP catch-up gate', () => { expect(res.isError).toBeFalsy(); expect(res.content[0].text).toMatch(/survivor/); }); + + it('a gate that never settles does not hang the first tool call (timeout fallback)', async () => { + // Defense in depth for the daemon init-hang: if catch-up never settles for + // any reason, the first call must still serve (best-effort) after the + // bounded wait instead of hanging forever. Tiny override keeps it fast. + process.env.CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS = '50'; + try { + handler.setCatchUpGate(new Promise(() => { /* never settles */ })); + const start = Date.now(); + const res = await handler.execute('codegraph_search', { query: 'survivor' }); + expect(Date.now() - start).toBeLessThan(5000); + expect(res.isError).toBeFalsy(); + expect(res.content[0].text).toMatch(/survivor/); + } finally { + delete process.env.CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS; + } + }); }); diff --git a/__tests__/parse-worker-grammar-load.test.ts b/__tests__/parse-worker-grammar-load.test.ts new file mode 100644 index 000000000..4087c0dea --- /dev/null +++ b/__tests__/parse-worker-grammar-load.test.ts @@ -0,0 +1,80 @@ +/** + * Regression test for the MCP daemon init-hang. + * + * `ensureWorker()` used to await a fresh parse worker's grammar-load with a bare + * `once('message')` that only resolved on `grammars-loaded`. If the worker DIED + * while loading grammars (a tree-sitter WASM abort), it never sent that message, + * so the await — and the in-process `indexMutex` it runs under — hung forever. + * In the shared daemon that wedged initialization for every connecting client. + * + * `awaitWorkerGrammarLoad` must settle (never hang) on every outcome: loaded, + * load-failed, worker error, worker exit, or timeout. + */ +import { EventEmitter } from 'node:events'; +import { describe, it, expect } from 'vitest'; +import { awaitWorkerGrammarLoad } from '../src/extraction/index'; + +class FakeWorker extends EventEmitter { + posted: unknown[] = []; + postMessage(msg: unknown): void { + this.posted.push(msg); + } +} + +const noLeakedListeners = (w: FakeWorker) => + w.listenerCount('message') === 0 && + w.listenerCount('error') === 0 && + w.listenerCount('exit') === 0; + +describe('awaitWorkerGrammarLoad — daemon init-hang regression', () => { + it('posts the load-grammars request and resolves on grammars-loaded', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 1000); + expect(w.posted).toEqual([{ type: 'load-grammars', languages: ['typescript'] }]); + w.emit('message', { type: 'grammars-loaded' }); + await expect(p).resolves.toBeUndefined(); + expect(noLeakedListeners(w)).toBe(true); + }); + + it('rejects (does not hang) on grammars-load-failed', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 1000); + w.emit('message', { type: 'grammars-load-failed', error: 'bad wasm' }); + await expect(p).rejects.toThrow(/bad wasm/); + expect(noLeakedListeners(w)).toBe(true); + }); + + it('rejects (does not hang) when the worker exits before grammars load — the WASM-abort case', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 1000); + w.emit('exit', 1); + await expect(p).rejects.toThrow(/exited/); + expect(noLeakedListeners(w)).toBe(true); + }); + + it('rejects (does not hang) when the worker emits an error', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 1000); + w.emit('error', new Error('worker died')); + await expect(p).rejects.toThrow(/worker died/); + expect(noLeakedListeners(w)).toBe(true); + }); + + it('rejects on timeout when the worker goes silent — never hangs forever', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 50); + await expect(p).rejects.toThrow(/timed out/); + expect(noLeakedListeners(w)).toBe(true); + }); + + it('settles exactly once — a late worker exit after grammars-loaded is ignored', async () => { + const w = new FakeWorker(); + const p = awaitWorkerGrammarLoad(w as never, ['typescript'], 1000); + w.emit('message', { type: 'grammars-loaded' }); + await expect(p).resolves.toBeUndefined(); + // The promise already resolved; a later crash must not double-settle or + // throw an unhandled 'error' (listeners are gone, so emitting is a no-op). + expect(() => w.emit('exit', 1)).not.toThrow(); + expect(noLeakedListeners(w)).toBe(true); + }); +}); diff --git a/src/extraction/index.ts b/src/extraction/index.ts index 42037d7f6..98a679feb 100644 --- a/src/extraction/index.ts +++ b/src/extraction/index.ts @@ -39,6 +39,14 @@ const FILE_IO_BATCH_SIZE = 10; */ const PARSE_TIMEOUT_MS = 10_000; +/** + * Backstop for the one-time grammar-load handshake with a fresh worker. The + * worker normally answers `grammars-loaded` in well under a second; this only + * fires if it goes silent without crashing. Worker crash/exit during load is + * caught immediately (no need to wait this out) — see `ensureWorker`. + */ +const GRAMMAR_LOAD_TIMEOUT_MS = 30_000; + /** * Number of files to parse before recycling the worker thread. * WASM linear memory can grow but NEVER shrink (WebAssembly spec limitation). @@ -48,6 +56,60 @@ const PARSE_TIMEOUT_MS = 10_000; */ const WORKER_RECYCLE_INTERVAL = 250; +/** + * Minimal shape of a parse worker needed for the grammar-load handshake. The + * real `worker_threads.Worker` satisfies it; tests pass a fake EventEmitter. + */ +export interface GrammarLoadWorker { + on(event: string, cb: (...args: any[]) => void): void; + off(event: string, cb: (...args: any[]) => void): void; + once(event: string, cb: (...args: any[]) => void): void; + postMessage(msg: unknown): void; +} + +/** + * Await a fresh worker's one-time grammar-load handshake, settling on the FIRST + * of: `grammars-loaded` (resolve) / `grammars-load-failed` / worker `error` / + * worker `exit` / timeout (all reject). Listeners are always removed and the + * timer cleared on settle. + * + * Regression guard (daemon-init-hang fix): the bare `once('message')` this + * replaced only resolved on `grammars-loaded` and never settled when the worker + * DIED loading grammars (a tree-sitter WASM abort). The caller's await — and the + * in-process `indexMutex` it runs under — then hung forever, which in a + * long-lived MCP daemon wedged initialization for every client that connected. + */ +export function awaitWorkerGrammarLoad( + worker: GrammarLoadWorker, + languages: Language[], + timeoutMs: number = GRAMMAR_LOAD_TIMEOUT_MS +): Promise { + return new Promise((resolve, reject) => { + let settled = false; + const finish = (err?: Error) => { + if (settled) return; + settled = true; + clearTimeout(timer); + worker.off('message', onMessage); + worker.off('error', onError); + worker.off('exit', onExit); + if (err) reject(err); else resolve(); + }; + const onMessage = (msg: { type: string; error?: string }) => { + if (msg.type === 'grammars-loaded') finish(); + else if (msg.type === 'grammars-load-failed') finish(new Error(`worker failed to load grammars: ${msg.error ?? 'unknown error'}`)); + }; + const onError = (err: Error) => finish(err); + const onExit = (code: number) => finish(new Error(`worker exited (code ${code}) before grammars loaded`)); + const timer = setTimeout(() => finish(new Error(`grammar load timed out after ${timeoutMs}ms`)), timeoutMs); + timer.unref?.(); + worker.on('message', onMessage); + worker.once('error', onError); + worker.once('exit', onExit); + worker.postMessage({ type: 'load-grammars', languages }); + }); +} + /** * Progress callback for indexing operations */ @@ -682,6 +744,22 @@ export class ExtractionOrchestrator { await loadGrammarsForLanguages(neededLanguages); } + // Set true if the worker becomes unusable mid-run (e.g. it crashed while + // loading grammars). From then on we parse in-process so a broken worker + // degrades to slower-but-correct parsing instead of hanging the run. + let workerDisabled = false; + // Grammars are already loaded on the main thread only when we never spawned + // a worker. The in-process fallback lazily loads them on first use. + let mainGrammarsLoaded = !useWorker; + + async function parseInProcess(filePath: string, content: string): Promise { + if (!mainGrammarsLoaded) { + await loadGrammarsForLanguages(neededLanguages); // idempotent — skips already-loaded + mainGrammarsLoaded = true; + } + return extractFromSource(filePath, content, detectLanguage(filePath, content), frameworkNames); + } + // --- Worker lifecycle management --- // The worker can crash (OOM in WASM) or hang on pathological files. // We track pending parse promises and handle both cases: @@ -738,23 +816,33 @@ export class ExtractionOrchestrator { async function ensureWorker(): Promise { if (parseWorker) return parseWorker; log('Spawning new parse worker...'); - parseWorker = new WorkerClass!(parseWorkerPath); - attachWorkerHandlers(parseWorker); - - // Load grammars in the new worker - await new Promise((resolve, reject) => { - parseWorker!.once('message', (msg: { type: string }) => { - if (msg.type === 'grammars-loaded') resolve(); - else reject(new Error(`Unexpected message: ${msg.type}`)); - }); - parseWorker!.postMessage({ type: 'load-grammars', languages: neededLanguages }); - }); + const w = new WorkerClass!(parseWorkerPath); + parseWorker = w; + attachWorkerHandlers(w); + + // Wait for grammars to load — bounded, and rejects (never hangs) if the + // worker dies mid-load. See `awaitWorkerGrammarLoad`. + try { + await awaitWorkerGrammarLoad(w, neededLanguages); + } catch (err) { + // Worker is unusable — tear it down and let the caller degrade. + if (parseWorker === w) parseWorker = null; + try { void w.terminate(); } catch { /* ignore */ } + throw err instanceof Error ? err : new Error(String(err)); + } - return parseWorker; + return w; } if (WorkerClass) { - await ensureWorker(); + try { + await ensureWorker(); + } catch (err) { + workerDisabled = true; + logWarn('Parse worker failed to start — falling back to in-process parsing', { + error: err instanceof Error ? err.message : String(err), + }); + } } /** @@ -773,14 +861,8 @@ export class ExtractionOrchestrator { } async function requestParse(filePath: string, content: string): Promise { - if (!WorkerClass) { - // In-process fallback - return extractFromSource( - filePath, - content, - detectLanguage(filePath, content), - frameworkNames - ); + if (!WorkerClass || workerDisabled) { + return parseInProcess(filePath, content); } // Recycle the worker before the next parse if we've hit the threshold. @@ -790,7 +872,19 @@ export class ExtractionOrchestrator { await recycleWorker(); } - const worker = await ensureWorker(); + let worker: import('worker_threads').Worker; + try { + worker = await ensureWorker(); + } catch (err) { + // The worker died (e.g. crashed while loading grammars) — degrade to + // in-process parsing for the rest of this run rather than failing every + // file or hanging on a worker that will never answer. + workerDisabled = true; + logWarn('Parse worker unavailable — falling back to in-process parsing', { + error: err instanceof Error ? err.message : String(err), + }); + return parseInProcess(filePath, content); + } const id = nextId++; workerParseCount++; diff --git a/src/extraction/parse-worker.ts b/src/extraction/parse-worker.ts index 9b0010ec2..ecd5c5b34 100644 --- a/src/extraction/parse-worker.ts +++ b/src/extraction/parse-worker.ts @@ -57,8 +57,19 @@ const parseCounts = new Map(); parentPort!.on('message', async (msg: { type: string; id?: number; filePath?: string; content?: string; languages?: Language[]; frameworkNames?: string[] }) => { if (msg.type === 'load-grammars') { - await loadGrammarsForLanguages(msg.languages!); - parentPort!.postMessage({ type: 'grammars-loaded' }); + try { + await loadGrammarsForLanguages(msg.languages!); + parentPort!.postMessage({ type: 'grammars-loaded' }); + } catch (err) { + // Don't die silently: a JS-level grammar-load failure must be reported so + // the parent degrades to in-process parsing instead of waiting forever + // for a 'grammars-loaded' that will never come. (A hard WASM abort kills + // the worker outright — the parent's 'exit'/'error' guard covers that.) + parentPort!.postMessage({ + type: 'grammars-load-failed', + error: err instanceof Error ? err.message : String(err), + }); + } } else if (msg.type === 'parse') { const { id, filePath, content, frameworkNames } = msg; try { diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index 935481b8a..a60feb40b 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -255,6 +255,19 @@ function adaptiveExploreEnabled(): boolean { return process.env.CODEGRAPH_ADAPTIVE_EXPLORE !== '0' && process.env.CODEGRAPH_ADAPTIVE_EXPLORE !== 'false'; } +/** + * Upper bound (ms) on how long the first tool call waits for the catch-up gate + * before serving best-effort. Defense in depth: a catch-up sync that never + * settles must not hang the first call — and, in the shared daemon, every + * client. Override via `CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS`; default 120s (well + * above any real incremental reconcile). + */ +function catchUpGateTimeoutMs(): number { + const raw = process.env.CODEGRAPH_CATCHUP_GATE_TIMEOUT_MS; + const n = raw ? Number.parseInt(raw, 10) : NaN; + return Number.isFinite(n) && n > 0 ? n : 120_000; +} + /** * Prefix each line of a source slice with its 1-based line number, matching * the Read tool's `cat -n` convention (number + tab) so the agent treats it @@ -1042,7 +1055,18 @@ export class ToolHandler { if (this.catchUpGate) { const gate = this.catchUpGate; this.catchUpGate = null; - try { await gate; } catch { /* engine already logged */ } + // Bound the wait: a catch-up sync that never settles (for any reason) + // must not wedge the first call. On timeout we proceed best-effort over + // possibly-stale data — the same outcome the rejection-swallow accepts. + try { + await Promise.race([ + gate, + new Promise((resolve) => { + const t = setTimeout(resolve, catchUpGateTimeoutMs()); + t.unref?.(); + }), + ]); + } catch { /* engine already logged */ } } // Honor the optional tool allowlist (CODEGRAPH_MCP_TOOLS): a trimmed // surface rejects ablated tools defensively even if a client cached them.