diff --git a/.changeset/mcp-error-categorization.md b/.changeset/mcp-error-categorization.md new file mode 100644 index 0000000000..25676853df --- /dev/null +++ b/.changeset/mcp-error-categorization.md @@ -0,0 +1,5 @@ +--- +'@hyperdx/api': patch +--- + +Classify MCP tool errors as `user` (bad input, not-found) or `server` (infrastructure failure, timeout) so alerting rules can filter on `error_category=server` without noise from agent input mistakes. Adds `error_category` attribute to spans and the `hyperdx.mcp.tool.errors` metric counter. ClickHouse errors are auto-classified by inspecting the error type and walking the cause chain for TCP-level codes. diff --git a/packages/api/package.json b/packages/api/package.json index 328e210910..3f36cccf66 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -10,6 +10,7 @@ "@ai-sdk/anthropic": "^3.0.58", "@ai-sdk/openai": "^3.0.47", "@braintree/sanitize-url": "^7.1.1", + "@clickhouse/client-common": "1.23.0-head.fae5998.1", "@esm2cjs/p-queue": "^7.3.0", "@hyperdx/common-utils": "^0.21.0", "@hyperdx/node-opentelemetry": "^0.9.0", diff --git a/packages/api/src/mcp/__tests__/query.test.ts b/packages/api/src/mcp/__tests__/query.test.ts index bd3f00fe0b..df27649d85 100644 --- a/packages/api/src/mcp/__tests__/query.test.ts +++ b/packages/api/src/mcp/__tests__/query.test.ts @@ -6,11 +6,16 @@ jest.mock('@/utils/trimToolResponse', () => ({ trimToolResponse: (data: unknown) => ({ data, isTrimmed: false }), })); +import { ClickHouseError } from '@clickhouse/client-common'; + import { annotateIncreaseTopNHint, assertSourceKindMatchesSelect, + clickHouseErrorResult, errorHint, + getClickHouseErrorType, INCREASE_TOP_N_CAP, + isServerError, mergeWhereIntoSelectItems, parseTimeRange, } from '@/mcp/tools/query/helpers'; @@ -20,6 +25,7 @@ import { validateMetricSelectItems, } from '@/mcp/tools/query/schemas'; import { resolveOrderBy } from '@/mcp/tools/query/table'; +import { getErrorCategory } from '@/mcp/utils/errors'; describe('parseTimeRange', () => { it('should return default range (last 15 minutes) when no arguments provided', () => { @@ -261,6 +267,306 @@ describe('errorHint', () => { }); }); +// ─── getClickHouseErrorType ────────────────────────────────────────────────── + +describe('getClickHouseErrorType', () => { + it('returns undefined for a non-Error value', () => { + expect(getClickHouseErrorType('NETWORK_ERROR')).toBeUndefined(); + expect(getClickHouseErrorType(null)).toBeUndefined(); + expect(getClickHouseErrorType(undefined)).toBeUndefined(); + expect(getClickHouseErrorType({ type: 'NETWORK_ERROR' })).toBeUndefined(); + }); + + it('returns the type from .cause when present (wrapped ClickHouseError)', () => { + const cause = new ClickHouseError({ + message: 'timeout', + code: '159', + type: 'SOCKET_TIMEOUT', + }); + const err = new Error('query failed', { cause }); + expect(getClickHouseErrorType(err)).toBe('SOCKET_TIMEOUT'); + }); + + it('returns the type from a direct ClickHouseError', () => { + const err = new ClickHouseError({ + message: 'direct', + code: '62', + type: 'SYNTAX_ERROR', + }); + expect(getClickHouseErrorType(err)).toBe('SYNTAX_ERROR'); + }); + + it('returns .cause type when wrapper carries a ClickHouseError cause', () => { + const cause = new ClickHouseError({ + message: 'inner', + code: '210', + type: 'NETWORK_ERROR', + }); + const err = new Error('outer wrapper', { cause }); + expect(getClickHouseErrorType(err)).toBe('NETWORK_ERROR'); + }); + + it('returns undefined when neither .cause nor the error is a ClickHouseError', () => { + expect(getClickHouseErrorType(new Error('plain'))).toBeUndefined(); + }); + + it('ignores non-ClickHouseError objects with a .type property', () => { + const err = Object.assign(new Error('not CH'), { type: 'NETWORK_ERROR' }); + expect(getClickHouseErrorType(err)).toBeUndefined(); + }); + + it('ignores .cause that is a plain object with a type (not ClickHouseError)', () => { + const err = new Error('no CH cause', { + cause: { type: 'SOCKET_TIMEOUT' }, + }); + expect(getClickHouseErrorType(err)).toBeUndefined(); + }); + + it('detects a cross-package ClickHouseError via constructor-name fallback', () => { + // Simulate a ClickHouseError from a different copy of + // @clickhouse/client-common where instanceof would fail. + class ClickHouseError extends Error { + type: string; + code: string; + constructor(opts: { message: string; code: string; type: string }) { + super(opts.message); + this.name = 'ClickHouseError'; + this.type = opts.type; + this.code = opts.code; + } + } + const cause = new ClickHouseError({ + message: 'network', + code: '210', + type: 'NETWORK_ERROR', + }); + const err = new Error('query failed', { cause }); + expect(getClickHouseErrorType(err)).toBe('NETWORK_ERROR'); + }); +}); + +// ─── isServerError ─────────────────────────────────────────────────────────── + +describe('isServerError', () => { + it('returns true for ClickHouseError with a server-side type', () => { + const cause = new ClickHouseError({ + message: 'network', + code: '210', + type: 'NETWORK_ERROR', + }); + const err = new Error('query failed', { cause }); + expect(isServerError(err)).toBe(true); + }); + + it('returns false for ClickHouseError with a query-level type', () => { + const cause = new ClickHouseError({ + message: 'syntax', + code: '62', + type: 'SYNTAX_ERROR', + }); + const err = new Error('query failed', { cause }); + expect(isServerError(err)).toBe(false); + }); + + it('returns true for ECONNREFUSED (TCP connection failure)', () => { + const err = Object.assign(new Error('connect ECONNREFUSED 127.0.0.1:1'), { + code: 'ECONNREFUSED', + }); + expect(isServerError(err)).toBe(true); + }); + + it('returns true for ENOTFOUND (DNS failure)', () => { + const err = Object.assign( + new Error('getaddrinfo ENOTFOUND bogus.invalid'), + { code: 'ENOTFOUND' }, + ); + expect(isServerError(err)).toBe(true); + }); + + it('returns true for ECONNRESET', () => { + const err = Object.assign(new Error('socket hang up'), { + code: 'ECONNRESET', + }); + expect(isServerError(err)).toBe(true); + }); + + it('returns true for ETIMEDOUT', () => { + const err = Object.assign(new Error('connect timed out'), { + code: 'ETIMEDOUT', + }); + expect(isServerError(err)).toBe(true); + }); + + it('returns true when TCP error is wrapped in .cause', () => { + const cause = Object.assign(new Error('connect ECONNREFUSED 127.0.0.1:1'), { + code: 'ECONNREFUSED', + }); + const err = new Error('request failed', { cause }); + expect(isServerError(err)).toBe(true); + }); + + it('returns false for a plain Error with no code', () => { + expect(isServerError(new Error('something failed'))).toBe(false); + }); + + it('returns false for non-Error values', () => { + expect(isServerError('ECONNREFUSED')).toBe(false); + expect(isServerError(null)).toBe(false); + }); + + it('returns true for ClickHouseError nested at depth 2+', () => { + const chErr = new ClickHouseError({ + message: 'network', + code: '210', + type: 'NETWORK_ERROR', + }); + const mid = new Error('mid-level wrapper', { cause: chErr }); + const outer = new Error('outer wrapper', { cause: mid }); + expect(isServerError(outer)).toBe(true); + }); + + it('detects a cross-package ClickHouseError via constructor-name fallback', () => { + // Locally-declared class — instanceof against the imported + // ClickHouseError will fail, exercising the name-based fallback. + class ClickHouseError extends Error { + type: string; + code: string; + constructor(opts: { message: string; code: string; type: string }) { + super(opts.message); + this.name = 'ClickHouseError'; + this.type = opts.type; + this.code = opts.code; + } + } + const cause = new ClickHouseError({ + message: 'network', + code: '210', + type: 'NETWORK_ERROR', + }); + const err = new Error('query failed', { cause }); + expect(isServerError(err)).toBe(true); + }); + + it('returns true for AggregateError containing a TCP error', () => { + const tcpErr = Object.assign( + new Error('connect ECONNREFUSED 127.0.0.1:1'), + { code: 'ECONNREFUSED' }, + ); + const aggErr = new AggregateError([tcpErr], 'all connections failed'); + expect(isServerError(aggErr)).toBe(true); + }); + + it('handles circular .cause without infinite loop', () => { + const a = new Error('a'); + const b = new Error('b', { cause: a }); + // Create circular reference: a.cause -> b -> a -> ... + (a as any).cause = b; + // Should terminate and return false (no server error codes) + expect(isServerError(a)).toBe(false); + }); +}); + +// ─── clickHouseErrorResult ─────────────────────────────────────────────────── + +describe('clickHouseErrorResult', () => { + it('classifies infrastructure error types as "server"', () => { + for (const type of [ + 'NETWORK_ERROR', + 'SOCKET_TIMEOUT', + 'POCO_EXCEPTION', + 'ALL_CONNECTION_TRIES_FAILED', + ]) { + const cause = new ClickHouseError({ + message: 'infra down', + code: '210', + type, + }); + const err = new Error('infra down', { cause }); + const result = clickHouseErrorResult(err); + expect(result.isError).toBe(true); + expect(getErrorCategory(result)).toBe('server'); + } + }); + + it('classifies query/syntax errors as "user"', () => { + const cause = new ClickHouseError({ + message: 'bad query', + code: '62', + type: 'SYNTAX_ERROR', + }); + const err = new Error('bad query', { cause }); + const result = clickHouseErrorResult(err); + expect(getErrorCategory(result)).toBe('user'); + }); + + it('defaults to "user" when no error type is available', () => { + const result = clickHouseErrorResult(new Error('mystery')); + expect(getErrorCategory(result)).toBe('user'); + }); + + it('classifies a non-Error value as "user"', () => { + const result = clickHouseErrorResult('some string failure'); + expect(getErrorCategory(result)).toBe('user'); + expect(result.content[0].text).toContain('some string failure'); + }); + + it('prepends a string prefix to the message', () => { + const result = clickHouseErrorResult( + new Error('boom'), + 'Failed to sample rows', + ); + expect(result.content[0].text).toBe('Failed to sample rows: boom'); + }); + + it('supports prefix and suffix via the object form', () => { + const result = clickHouseErrorResult(new Error('boom'), { + prefix: 'Failed to compute breakdown', + suffix: 'The parentFilter must be valid ClickHouse SQL.', + }); + expect(result.content[0].text).toBe( + 'Failed to compute breakdown: boom The parentFilter must be valid ClickHouse SQL.', + ); + }); + + it('preserves categorization regardless of message context', () => { + const cause = new ClickHouseError({ + message: 'infra down', + code: '210', + type: 'NETWORK_ERROR', + }); + const err = new Error('infra down', { cause }); + const result = clickHouseErrorResult(err, 'Failed to sample rows'); + expect(getErrorCategory(result)).toBe('server'); + }); + + it('appends the DateTime64 hint and still classifies as "user"', () => { + const err = new Error("Cannot parse string '2024Z' as DateTime64(9)"); + const result = clickHouseErrorResult(err); + expect(result.content[0].text).toContain('HINT:'); + expect(getErrorCategory(result)).toBe('user'); + }); + + it('classifies ECONNREFUSED as "server"', () => { + const err = Object.assign( + new Error('connect ECONNREFUSED 127.0.0.1:19999'), + { code: 'ECONNREFUSED' }, + ); + const result = clickHouseErrorResult(err); + expect(result.isError).toBe(true); + expect(getErrorCategory(result)).toBe('server'); + }); + + it('classifies ENOTFOUND as "server"', () => { + const err = Object.assign( + new Error('getaddrinfo ENOTFOUND bogus.invalid'), + { code: 'ENOTFOUND' }, + ); + const result = clickHouseErrorResult(err); + expect(result.isError).toBe(true); + expect(getErrorCategory(result)).toBe('server'); + }); +}); + // ─── resolveOrderBy ────────────────────────────────────────────────────────── describe('resolveOrderBy', () => { diff --git a/packages/api/src/mcp/__tests__/queryTool.test.ts b/packages/api/src/mcp/__tests__/queryTool.test.ts index 87e83fbf9c..35f405fdc9 100644 --- a/packages/api/src/mcp/__tests__/queryTool.test.ts +++ b/packages/api/src/mcp/__tests__/queryTool.test.ts @@ -692,6 +692,143 @@ describe('MCP Query Tools', () => { }); }); + // ─── ClickHouse query errors ────────────────────────────────────────────────── + + describe('ClickHouse query error handling', () => { + describe('clickstack_sql errors', () => { + it('should return isError for a syntax error', async () => { + const result = await callTool(client, 'clickstack_sql', { + connectionId: connection._id.toString(), + sql: 'SELECTT 1', + }); + + expect(result.isError).toBe(true); + expect(getFirstText(result)).toMatch( + /SYNTAX_ERROR|Syntax error|unknown/i, + ); + }); + + it('should return isError for a nonexistent column', async () => { + const result = await callTool(client, 'clickstack_sql', { + connectionId: connection._id.toString(), + sql: `SELECT nonexistent_column_xyz FROM ${DEFAULT_DATABASE}.${DEFAULT_LOGS_TABLE} LIMIT 1`, + }); + + expect(result.isError).toBe(true); + expect(getFirstText(result)).toMatch( + /UNKNOWN_IDENTIFIER|Missing columns|unknown/i, + ); + }); + + it('should return isError for a nonexistent table', async () => { + const result = await callTool(client, 'clickstack_sql', { + connectionId: connection._id.toString(), + sql: `SELECT 1 FROM ${DEFAULT_DATABASE}.nonexistent_table_xyz LIMIT 1`, + }); + + expect(result.isError).toBe(true); + expect(getFirstText(result)).toMatch( + /UNKNOWN_TABLE|doesn.t exist|unknown/i, + ); + }); + }); + + describe('clickstack_table errors', () => { + it('should return isError for a nonexistent valueExpression column', async () => { + const result = await callTool(client, 'clickstack_table', { + sourceId: traceSource._id.toString(), + select: [{ aggFn: 'avg', valueExpression: 'nonexistent_column_xyz' }], + }); + + expect(result.isError).toBe(true); + expect(getFirstText(result)).toMatch( + /UNKNOWN_IDENTIFIER|Missing columns|unknown/i, + ); + }); + }); + + describe('clickstack_timeseries errors', () => { + it('should return isError for a nonexistent valueExpression column', async () => { + const result = await callTool(client, 'clickstack_timeseries', { + sourceId: traceSource._id.toString(), + select: [{ aggFn: 'sum', valueExpression: 'nonexistent_column_xyz' }], + }); + + expect(result.isError).toBe(true); + expect(getFirstText(result)).toMatch( + /UNKNOWN_IDENTIFIER|Missing columns|unknown/i, + ); + }); + }); + + it('should not leak _errorCategory on the wire result', async () => { + const result = await callTool(client, 'clickstack_sql', { + connectionId: connection._id.toString(), + sql: 'SELECTT 1', + }); + + expect(result.isError).toBe(true); + expect(result).not.toHaveProperty('_errorCategory'); + }); + + describe('infrastructure errors (unreachable ClickHouse)', () => { + let deadConnection: any; + let deadSource: SourceDocument; + + beforeEach(async () => { + deadConnection = await Connection.create({ + team: team._id, + name: 'Dead', + host: 'http://localhost:1', + username: 'default', + password: '', + }); + + deadSource = await Source.create({ + kind: SourceKind.Trace, + team: team._id, + from: { + databaseName: DEFAULT_DATABASE, + tableName: DEFAULT_TRACES_TABLE, + }, + timestampValueExpression: 'Timestamp', + connection: deadConnection._id, + name: 'Dead Traces', + }); + }); + + it('should return isError for clickstack_sql against unreachable host', async () => { + const result = await callTool(client, 'clickstack_sql', { + connectionId: deadConnection._id.toString(), + sql: 'SELECT 1', + }); + + expect(result.isError).toBe(true); + expect(getFirstText(result).length).toBeGreaterThan(0); + }); + + it('should return isError for clickstack_table against unreachable host', async () => { + const result = await callTool(client, 'clickstack_table', { + sourceId: deadSource._id.toString(), + select: [{ aggFn: 'count' }], + }); + + expect(result.isError).toBe(true); + expect(getFirstText(result).length).toBeGreaterThan(0); + }); + + it('should return isError for clickstack_timeseries against unreachable host', async () => { + const result = await callTool(client, 'clickstack_timeseries', { + sourceId: deadSource._id.toString(), + select: [{ aggFn: 'count' }], + }); + + expect(result.isError).toBe(true); + expect(getFirstText(result).length).toBeGreaterThan(0); + }); + }); + }); + // ─── Safety settings (readonly + max_execution_time) ───────────────────────── describe('ClickHouse safety settings', () => { diff --git a/packages/api/src/mcp/__tests__/tracing.test.ts b/packages/api/src/mcp/__tests__/tracing.test.ts index e38fa18682..bc52bfc0ed 100644 --- a/packages/api/src/mcp/__tests__/tracing.test.ts +++ b/packages/api/src/mcp/__tests__/tracing.test.ts @@ -72,6 +72,7 @@ jest.mock('@/utils/logger', () => ({ }, })); +import { mcpServerError, mcpUserError } from '@/mcp/utils/errors'; import { withToolTracing } from '@/mcp/utils/tracing'; describe('withToolTracing', () => { @@ -196,7 +197,10 @@ describe('withToolTracing', () => { const traced = withToolTracing('my_tool', context, handler); await traced({}); - expect(mockCounter.add).toHaveBeenCalledWith(1, { tool: 'my_tool' }); + expect(mockCounter.add).toHaveBeenCalledWith(1, { + tool: 'my_tool', + error_category: 'server', + }); }); it('should increment the error counter on a thrown exception', async () => { @@ -206,9 +210,90 @@ describe('withToolTracing', () => { await expect(traced({})).rejects.toThrow('boom'); - expect(mockCounter.add).toHaveBeenCalledWith(1, { tool: 'my_tool' }); + expect(mockCounter.add).toHaveBeenCalledWith(1, { + tool: 'my_tool', + error_category: 'server', + }); expect(mockHistogram.record).toHaveBeenCalledWith(expect.any(Number), { tool: 'my_tool', }); }); + + // ─── Error category tests ─────────────────────────────────────────────── + + it('should default error_category to "server" when no category is set', async () => { + const handler = jest.fn().mockResolvedValue({ + isError: true, + content: [{ type: 'text', text: 'unknown failure' }], + }); + + const traced = withToolTracing('my_tool', context, handler); + await traced({}); + + expect(mockSpan.setAttribute).toHaveBeenCalledWith( + 'mcp.tool.error_category', + 'server', + ); + }); + + it('should record error_category "user" on span and counter', async () => { + const handler = jest.fn().mockResolvedValue(mcpUserError('bad input')); + + const traced = withToolTracing('my_tool', context, handler); + await traced({}); + + expect(mockSpan.setAttribute).toHaveBeenCalledWith( + 'mcp.tool.error_category', + 'user', + ); + expect(mockCounter.add).toHaveBeenCalledWith(1, { + tool: 'my_tool', + error_category: 'user', + }); + }); + + it('should record error_category "server" on span and counter', async () => { + const handler = jest + .fn() + .mockResolvedValue(mcpServerError('database timeout')); + + const traced = withToolTracing('my_tool', context, handler); + await traced({}); + + expect(mockSpan.setAttribute).toHaveBeenCalledWith( + 'mcp.tool.error_category', + 'server', + ); + expect(mockCounter.add).toHaveBeenCalledWith(1, { + tool: 'my_tool', + error_category: 'server', + }); + }); + + it('should not set error_category on successful results', async () => { + const handler = jest.fn().mockResolvedValue({ + content: [{ type: 'text', text: 'ok' }], + }); + + const traced = withToolTracing('my_tool', context, handler); + await traced({}); + + expect(mockSpan.setAttribute).not.toHaveBeenCalledWith( + 'mcp.tool.error_category', + expect.anything(), + ); + }); + + it('should not leak error category metadata on the returned result', async () => { + const handler = jest.fn().mockResolvedValue(mcpUserError('bad input')); + + const traced = withToolTracing('my_tool', context, handler); + const result = await traced({}); + + expect(result).toEqual({ + isError: true, + content: [{ type: 'text', text: 'bad input' }], + }); + expect(result).not.toHaveProperty('_errorCategory'); + }); }); diff --git a/packages/api/src/mcp/tools/alerts/getAlert.ts b/packages/api/src/mcp/tools/alerts/getAlert.ts index 443a403952..9250f7c308 100644 --- a/packages/api/src/mcp/tools/alerts/getAlert.ts +++ b/packages/api/src/mcp/tools/alerts/getAlert.ts @@ -7,7 +7,7 @@ import * as config from '@/config'; import { getRecentAlertHistories } from '@/controllers/alertHistory'; import { getAlertById } from '@/controllers/alerts'; import type { ToolRegistrar } from '@/mcp/tools/types'; -import { validateObjectId } from '@/mcp/utils/errors'; +import { mcpUserError, validateObjectId } from '@/mcp/utils/errors'; import Alert from '@/models/alert'; import type { IDashboard } from '@/models/dashboard'; import type { ISavedSearch } from '@/models/savedSearch'; @@ -114,10 +114,7 @@ export function registerGetAlert({ const alert = await getAlertById(id, teamId); if (!alert) { - return { - isError: true, - content: [{ type: 'text' as const, text: 'Alert not found' }], - }; + return mcpUserError('Alert not found'); } const external = translateAlertDocumentToExternalAlert(alert); diff --git a/packages/api/src/mcp/tools/alerts/saveAlert.ts b/packages/api/src/mcp/tools/alerts/saveAlert.ts index 0494c98bc1..ee0bfde733 100644 --- a/packages/api/src/mcp/tools/alerts/saveAlert.ts +++ b/packages/api/src/mcp/tools/alerts/saveAlert.ts @@ -9,7 +9,11 @@ import { validateAlertInput, } from '@/controllers/alerts'; import type { ToolRegistrar } from '@/mcp/tools/types'; -import { mcpError, validateObjectId } from '@/mcp/utils/errors'; +import { + mcpServerError, + mcpUserError, + validateObjectId, +} from '@/mcp/utils/errors'; import { type AlertChannel, AlertSource } from '@/models/alert'; import { BaseError } from '@/utils/errors'; import { translateAlertDocumentToExternalAlert } from '@/utils/externalApi'; @@ -52,7 +56,7 @@ export function registerSaveAlert({ // ── Runtime cross-field validation ── const validationError = validateSaveAlertInput(input); if (validationError) { - return mcpError(validationError); + return mcpUserError(validationError); } // ── Validate ID for updates (early return narrows input.id to string) ── @@ -96,7 +100,7 @@ export function registerSaveAlert({ : e instanceof Error ? e.message : String(e); - return mcpError(msg); + return e instanceof BaseError ? mcpUserError(msg) : mcpServerError(msg); } const mongoUserId = new mongoose.Types.ObjectId(userId); @@ -105,7 +109,7 @@ export function registerSaveAlert({ if (alertId) { const updated = await updateAlert(alertId, mongoTeamId, alertInput); if (!updated) { - return mcpError('Alert not found'); + return mcpUserError('Alert not found'); } return { content: [ diff --git a/packages/api/src/mcp/tools/dashboards/deleteDashboard.ts b/packages/api/src/mcp/tools/dashboards/deleteDashboard.ts index 88236f951d..bd60c9ee8d 100644 --- a/packages/api/src/mcp/tools/dashboards/deleteDashboard.ts +++ b/packages/api/src/mcp/tools/dashboards/deleteDashboard.ts @@ -3,6 +3,7 @@ import { z } from 'zod'; import { deleteDashboard } from '@/controllers/dashboard'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpUserError } from '@/mcp/utils/errors'; import Dashboard from '@/models/dashboard'; import { objectIdSchema } from '@/utils/zod'; @@ -29,10 +30,7 @@ export function registerDeleteDashboard({ team: teamId, }).lean(); if (!existing) { - return { - isError: true, - content: [{ type: 'text' as const, text: 'Dashboard not found' }], - }; + return mcpUserError('Dashboard not found'); } await deleteDashboard(dashboardId, new mongoose.Types.ObjectId(teamId)); diff --git a/packages/api/src/mcp/tools/dashboards/getDashboard.ts b/packages/api/src/mcp/tools/dashboards/getDashboard.ts index 5d7b0a8e4f..e0de423f0c 100644 --- a/packages/api/src/mcp/tools/dashboards/getDashboard.ts +++ b/packages/api/src/mcp/tools/dashboards/getDashboard.ts @@ -4,7 +4,7 @@ import { z } from 'zod'; import * as config from '@/config'; import { getDashboards } from '@/controllers/dashboard'; import type { ToolRegistrar } from '@/mcp/tools/types'; -import { validateObjectId } from '@/mcp/utils/errors'; +import { mcpUserError, validateObjectId } from '@/mcp/utils/errors'; import Dashboard from '@/models/dashboard'; import { convertToExternalDashboard } from '@/routers/external-api/v2/utils/dashboards'; @@ -54,10 +54,7 @@ export function registerGetDashboard({ const dashboard = await Dashboard.findOne({ _id: id, team: teamId }); if (!dashboard) { - return { - isError: true, - content: [{ type: 'text' as const, text: 'Dashboard not found' }], - }; + return mcpUserError('Dashboard not found'); } return { content: [ diff --git a/packages/api/src/mcp/tools/dashboards/getDashboardTile.ts b/packages/api/src/mcp/tools/dashboards/getDashboardTile.ts index 4e4e926cca..5298cac0ec 100644 --- a/packages/api/src/mcp/tools/dashboards/getDashboardTile.ts +++ b/packages/api/src/mcp/tools/dashboards/getDashboardTile.ts @@ -1,6 +1,7 @@ import { z } from 'zod'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpUserError } from '@/mcp/utils/errors'; import Dashboard from '@/models/dashboard'; import { convertToExternalDashboard } from '@/routers/external-api/v2/utils/dashboards'; import { objectIdSchema } from '@/utils/zod'; @@ -36,24 +37,15 @@ export function registerGetDashboardTile({ team: teamId, }); if (!dashboard) { - return { - isError: true, - content: [{ type: 'text' as const, text: 'Dashboard not found' }], - }; + return mcpUserError('Dashboard not found'); } const externalDashboard = convertToExternalDashboard(dashboard); const tile = externalDashboard.tiles.find(t => t.id === tileId); if (!tile) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Tile not found: ${tileId}. Available tile IDs: ${externalDashboard.tiles.map(t => `${t.id} (${t.name})`).join(', ')}`, - }, - ], - }; + return mcpUserError( + `Tile not found: ${tileId}. Available tile IDs: ${externalDashboard.tiles.map(t => `${t.id} (${t.name})`).join(', ')}`, + ); } return { diff --git a/packages/api/src/mcp/tools/dashboards/patchDashboard.ts b/packages/api/src/mcp/tools/dashboards/patchDashboard.ts index 2f1c34696d..ff0a9fac83 100644 --- a/packages/api/src/mcp/tools/dashboards/patchDashboard.ts +++ b/packages/api/src/mcp/tools/dashboards/patchDashboard.ts @@ -2,7 +2,7 @@ import { uniq } from 'lodash'; import * as config from '@/config'; import type { ToolRegistrar } from '@/mcp/tools/types'; -import { mcpError } from '@/mcp/utils/errors'; +import { mcpUserError } from '@/mcp/utils/errors'; import Dashboard from '@/models/dashboard'; import { cleanupDashboardAlerts, @@ -47,26 +47,14 @@ export function registerPatchDashboard({ tags === undefined && (tileId === undefined || inputTile === undefined) ) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: 'Provide at least one of: name, tags, or tileId+tile to patch.', - }, - ], - }; + return mcpUserError( + 'Provide at least one of: name, tags, or tileId+tile to patch.', + ); } if ((tileId === undefined) !== (inputTile === undefined)) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: 'tileId and tile must both be provided or both omitted.', - }, - ], - }; + return mcpUserError( + 'tileId and tile must both be provided or both omitted.', + ); } const existingDashboard = await Dashboard.findOne({ @@ -74,10 +62,7 @@ export function registerPatchDashboard({ team: teamId, }); if (!existingDashboard) { - return { - isError: true, - content: [{ type: 'text' as const, text: 'Dashboard not found' }], - }; + return mcpUserError('Dashboard not found'); } // Build the $set payload and the query filter. Metadata fields @@ -112,15 +97,9 @@ export function registerPatchDashboard({ // Convert only for the error message (read-only, no write-back). const externalDashboard = convertToExternalDashboard(existingDashboard); - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Tile not found: ${tileId}. Available tile IDs: ${externalDashboard.tiles.map(t => `${t.id} (${t.name})`).join(', ')}`, - }, - ], - }; + return mcpUserError( + `Tile not found: ${tileId}. Available tile IDs: ${externalDashboard.tiles.map(t => `${t.id} (${t.name})`).join(', ')}`, + ); } // Read the existing tile's layout and container refs from the @@ -167,7 +146,7 @@ export function registerPatchDashboard({ // macros which require a source to be set const sqlFilterSourceError = getRawSqlMissingSourceError([mergedTile]); if (sqlFilterSourceError) { - return mcpError(sqlFilterSourceError); + return mcpUserError(sqlFilterSourceError); } // Validate the patched tile using the shared validation helper. @@ -178,23 +157,12 @@ export function registerPatchDashboard({ containers: existingDashboard.containers ?? [], }); if (validationError) { - return { - isError: true, - content: [{ type: 'text' as const, text: validationError }], - }; + return mcpUserError(validationError); } // Convert only the patched tile to internal format. if (!isConfigTile(mergedTile)) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: 'Tile must have a config block.', - }, - ], - }; + return mcpUserError('Tile must have a config block.'); } const internalTile = convertToInternalTileConfig(mergedTile); @@ -219,22 +187,12 @@ export function registerPatchDashboard({ // was removed or the dashboard was deleted between our read // and this write. if (tileId !== undefined) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: - `Tile ${tileId} was not found at write time (it may have been removed by a concurrent update). ` + - 'The entire update was rejected — name/tags changes (if any) were not applied. Resubmit.', - }, - ], - }; + return mcpUserError( + `Tile ${tileId} was not found at write time (it may have been removed by a concurrent update). ` + + 'The entire update was rejected — name/tags changes (if any) were not applied. Resubmit.', + ); } - return { - isError: true, - content: [{ type: 'text' as const, text: 'Dashboard not found' }], - }; + return mcpUserError('Dashboard not found'); } // Reconcile alerts: if the tile's displayType changed to one diff --git a/packages/api/src/mcp/tools/dashboards/queryTile.ts b/packages/api/src/mcp/tools/dashboards/queryTile.ts index b345997bf9..7c00ccb3e7 100644 --- a/packages/api/src/mcp/tools/dashboards/queryTile.ts +++ b/packages/api/src/mcp/tools/dashboards/queryTile.ts @@ -2,6 +2,7 @@ import { z } from 'zod'; import { parseTimeRange, runConfigTile } from '@/mcp/tools/query/helpers'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpUserError } from '@/mcp/utils/errors'; import Dashboard from '@/models/dashboard'; import { convertToExternalDashboard } from '@/routers/external-api/v2/utils/dashboards'; import { objectIdSchema } from '@/utils/zod'; @@ -47,10 +48,7 @@ export function registerQueryTile({ async ({ dashboardId, tileId, startTime, endTime }) => { const timeRange = parseTimeRange(startTime, endTime); if ('error' in timeRange) { - return { - isError: true, - content: [{ type: 'text' as const, text: timeRange.error }], - }; + return mcpUserError(timeRange.error); } const { startDate, endDate } = timeRange; @@ -59,24 +57,15 @@ export function registerQueryTile({ team: teamId, }); if (!dashboard) { - return { - isError: true, - content: [{ type: 'text' as const, text: 'Dashboard not found' }], - }; + return mcpUserError('Dashboard not found'); } const externalDashboard = convertToExternalDashboard(dashboard); const tile = externalDashboard.tiles.find(t => t.id === tileId); if (!tile) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Tile not found: ${tileId}. Available tile IDs: ${externalDashboard.tiles.map(t => t.id).join(', ')}`, - }, - ], - }; + return mcpUserError( + `Tile not found: ${tileId}. Available tile IDs: ${externalDashboard.tiles.map(t => t.id).join(', ')}`, + ); } const result = await runConfigTile( diff --git a/packages/api/src/mcp/tools/dashboards/saveDashboard.ts b/packages/api/src/mcp/tools/dashboards/saveDashboard.ts index 7afcc1c463..cfafece00c 100644 --- a/packages/api/src/mcp/tools/dashboards/saveDashboard.ts +++ b/packages/api/src/mcp/tools/dashboards/saveDashboard.ts @@ -5,7 +5,7 @@ import { z } from 'zod'; import * as config from '@/config'; import type { ToolRegistrar } from '@/mcp/tools/types'; -import { mcpError } from '@/mcp/utils/errors'; +import { mcpUserError } from '@/mcp/utils/errors'; import Dashboard, { IDashboard } from '@/models/dashboard'; import { cleanupDashboardAlerts, @@ -159,15 +159,9 @@ async function createDashboard({ filters: stripFilterIds(inputFilters), }); if (!parsed.success) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Validation error: ${JSON.stringify(parsed.error.errors)}`, - }, - ], - }; + return mcpUserError( + `Validation error: ${JSON.stringify(parsed.error.errors)}`, + ); } const { tiles, filters, containers: parsedContainers } = parsed.data; @@ -175,7 +169,7 @@ async function createDashboard({ const sqlFilterSourceError = getRawSqlMissingSourceError(tilesWithId); if (sqlFilterSourceError) { - return mcpError(sqlFilterSourceError); + return mcpUserError(sqlFilterSourceError); } const validationError = await validateDashboardTiles({ @@ -185,10 +179,7 @@ async function createDashboard({ containers: parsedContainers ?? [], }); if (validationError) { - return { - isError: true, - content: [{ type: 'text' as const, text: validationError }], - }; + return mcpUserError(validationError); } const macroWarnings = getRawSqlTileMacroWarnings(tilesWithId); @@ -264,15 +255,9 @@ async function updateDashboard({ filters: assignFilterIds(inputFilters), }); if (!parsed.success) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Validation error: ${JSON.stringify(parsed.error.errors)}`, - }, - ], - }; + return mcpUserError( + `Validation error: ${JSON.stringify(parsed.error.errors)}`, + ); } const { tiles, filters, containers: parsedContainers } = parsed.data; @@ -280,7 +265,7 @@ async function updateDashboard({ const sqlFilterSourceError = getRawSqlMissingSourceError(tilesWithId); if (sqlFilterSourceError) { - return mcpError(sqlFilterSourceError); + return mcpUserError(sqlFilterSourceError); } const existingDashboard = await Dashboard.findOne( @@ -289,10 +274,7 @@ async function updateDashboard({ ).lean(); if (!existingDashboard) { - return { - isError: true, - content: [{ type: 'text' as const, text: 'Dashboard not found' }], - }; + return mcpUserError('Dashboard not found'); } const effectiveContainers = @@ -305,10 +287,7 @@ async function updateDashboard({ containers: effectiveContainers, }); if (validationError) { - return { - isError: true, - content: [{ type: 'text' as const, text: validationError }], - }; + return mcpUserError(validationError); } const macroWarnings = getRawSqlTileMacroWarnings(tilesWithId); @@ -365,10 +344,7 @@ async function updateDashboard({ ); if (!updatedDashboard) { - return { - isError: true, - content: [{ type: 'text' as const, text: 'Dashboard not found' }], - }; + return mcpUserError('Dashboard not found'); } await cleanupDashboardAlerts({ diff --git a/packages/api/src/mcp/tools/dashboards/searchDashboards.ts b/packages/api/src/mcp/tools/dashboards/searchDashboards.ts index 9fb29f5e38..f2418f6a48 100644 --- a/packages/api/src/mcp/tools/dashboards/searchDashboards.ts +++ b/packages/api/src/mcp/tools/dashboards/searchDashboards.ts @@ -2,6 +2,7 @@ import { escapeRegExp } from 'lodash'; import * as config from '@/config'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpServerError, mcpUserError } from '@/mcp/utils/errors'; import Dashboard from '@/models/dashboard'; import logger from '@/utils/logger'; @@ -31,15 +32,9 @@ export function registerSearchDashboards({ const hasTags = Array.isArray(tags) && tags.length > 0; if (!hasQuery && !hasTags) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: 'Provide at least one of: query (non-empty string) or tags (non-empty array).', - }, - ], - }; + return mcpUserError( + 'Provide at least one of: query (non-empty string) or tags (non-empty array).', + ); } const filter: Record = { team: teamId }; @@ -92,15 +87,9 @@ export function registerSearchDashboards({ { err, teamId, query, tags }, 'clickstack_search_dashboards: query failed', ); - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Search failed: ${err instanceof Error ? err.message : String(err)}`, - }, - ], - }; + return mcpServerError( + `Search failed: ${err instanceof Error ? err.message : String(err)}`, + ); } }, ); diff --git a/packages/api/src/mcp/tools/query/eventDeltas.ts b/packages/api/src/mcp/tools/query/eventDeltas.ts index 96e4a775a3..6976ab42a4 100644 --- a/packages/api/src/mcp/tools/query/eventDeltas.ts +++ b/packages/api/src/mcp/tools/query/eventDeltas.ts @@ -15,9 +15,10 @@ import { z } from 'zod'; import { getConnectionById } from '@/controllers/connection'; import { getSource } from '@/controllers/sources'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpUserError } from '@/mcp/utils/errors'; import { trimToolResponse } from '@/utils/trimToolResponse'; -import { parseTimeRange } from './helpers'; +import { clickHouseErrorResult, parseTimeRange } from './helpers'; // ─── Schema ────────────────────────────────────────────────────────────────── @@ -200,30 +201,14 @@ export function registerEventDeltas({ context, registerTool }: ToolRegistrar) { input.target.endTime, ); if ('error' in targetRange) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `target: ${targetRange.error}`, - }, - ], - }; + return mcpUserError(`target: ${targetRange.error}`); } const baselineRange = parseTimeRange( input.baseline.startTime, input.baseline.endTime, ); if ('error' in baselineRange) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `baseline: ${baselineRange.error}`, - }, - ], - }; + return mcpUserError(`baseline: ${baselineRange.error}`); } // Refuse calls where target and baseline can't possibly produce a @@ -240,47 +225,28 @@ export function registerEventDeltas({ context, registerTool }: ToolRegistrar) { targetRange.startDate.getTime() === baselineRange.startDate.getTime() && targetRange.endDate.getTime() === baselineRange.endDate.getTime(); if (sameWhere && sameWindow) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: - 'target and baseline are identical (same where + same time ' + - 'window) — that yields nothing to compare. Pick distinct ' + - 'groups: e.g. (a) before vs after a step change in time, ' + - '(b) failing rows vs succeeding rows, (c) slow spans ' + - '(Duration > X) vs fast spans (Duration <= X) in the same ' + - 'window, or (d) one cohort vs the rest. At least one of ' + - '`where` or the time window must differ between the two ' + - 'groups.', - }, - ], - }; + return mcpUserError( + 'target and baseline are identical (same where + same time ' + + 'window) — that yields nothing to compare. Pick distinct ' + + 'groups: e.g. (a) before vs after a step change in time, ' + + '(b) failing rows vs succeeding rows, (c) slow spans ' + + '(Duration > X) vs fast spans (Duration <= X) in the same ' + + 'window, or (d) one cohort vs the rest. At least one of ' + + '`where` or the time window must differ between the two ' + + 'groups.', + ); } const source = await getSource(teamId.toString(), input.sourceId); if (!source) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Source not found: ${input.sourceId}. Call clickstack_list_sources to find available source IDs.`, - }, - ], - }; + return mcpUserError( + `Source not found: ${input.sourceId}. Call clickstack_list_sources to find available source IDs.`, + ); } if (source.kind !== SourceKind.Trace && source.kind !== SourceKind.Log) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Source ${input.sourceId} is kind="${source.kind}". clickstack_event_deltas requires a trace or log source.`, - }, - ], - }; + return mcpUserError( + `Source ${input.sourceId} is kind="${source.kind}". clickstack_event_deltas requires a trace or log source.`, + ); } const connection = await getConnectionById( @@ -289,15 +255,9 @@ export function registerEventDeltas({ context, registerTool }: ToolRegistrar) { true, ); if (!connection) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Connection not found for source: ${input.sourceId}`, - }, - ], - }; + return mcpUserError( + `Connection not found for source: ${input.sourceId}`, + ); } const clickhouseClient = new ClickhouseClient({ @@ -381,15 +341,7 @@ export function registerEventDeltas({ context, registerTool }: ToolRegistrar) { columnMetaUnavailable = true; } } catch (e) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Failed to build sample queries: ${e instanceof Error ? e.message : String(e)}`, - }, - ], - }; + return clickHouseErrorResult(e, 'Failed to build sample queries'); } let targetRows: Record[]; @@ -424,15 +376,7 @@ export function registerEventDeltas({ context, registerTool }: ToolRegistrar) { baselineRows = baselineJson.data ?? []; } catch (e) { abortController.abort(); - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Failed to sample rows: ${e instanceof Error ? e.message : String(e)}`, - }, - ], - }; + return clickHouseErrorResult(e, 'Failed to sample rows'); } const ranked = rankProperties({ diff --git a/packages/api/src/mcp/tools/query/eventPatterns.ts b/packages/api/src/mcp/tools/query/eventPatterns.ts index 96fd9dc472..ef91339c2c 100644 --- a/packages/api/src/mcp/tools/query/eventPatterns.ts +++ b/packages/api/src/mcp/tools/query/eventPatterns.ts @@ -1,6 +1,7 @@ import { z } from 'zod'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpUserError } from '@/mcp/utils/errors'; import { parseTimeRange } from './helpers'; import { runEventPatterns } from './runEventPatterns'; @@ -93,10 +94,7 @@ export function registerEventPatterns({ async input => { const timeRange = parseTimeRange(input.startTime, input.endTime); if ('error' in timeRange) { - return { - isError: true, - content: [{ type: 'text' as const, text: timeRange.error }], - }; + return mcpUserError(timeRange.error); } const { startDate, endDate } = timeRange; diff --git a/packages/api/src/mcp/tools/query/helpers.ts b/packages/api/src/mcp/tools/query/helpers.ts index 377750e28a..5a148a703e 100644 --- a/packages/api/src/mcp/tools/query/helpers.ts +++ b/packages/api/src/mcp/tools/query/helpers.ts @@ -1,3 +1,4 @@ +import { ClickHouseError } from '@clickhouse/client-common'; import { ClickhouseClient } from '@hyperdx/common-utils/dist/clickhouse/node'; import { getMetadata } from '@hyperdx/common-utils/dist/core/metadata'; import { @@ -22,6 +23,8 @@ import ms from 'ms'; import { getConnectionById } from '@/controllers/connection'; import { getSource } from '@/controllers/sources'; +import type { McpErrorResult } from '@/mcp/utils/errors'; +import { mcpServerError, mcpUserError } from '@/mcp/utils/errors'; import { convertToInternalTileConfig, isConfigTile, @@ -331,7 +334,7 @@ type SelectItemForKindCheck = { metricType?: unknown }; export function assertSourceKindMatchesSelect( source: { kind: string }, select: unknown, -): { isError: true; content: [{ type: 'text'; text: string }] } | null { +): McpErrorResult | null { // Raw-string select (rare on the builder path) — the renderer handles // it; no metric annotations to inspect. if (typeof select === 'string') return null; @@ -348,35 +351,21 @@ export function assertSourceKindMatchesSelect( const isMetricSource = source.kind === SourceKind.Metric; if (isMetricSource && metricItemCount === 0) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: - 'Source kind is "metric", but no select item specifies metricType + metricName. ' + - 'Each select item on a metric source must set metricType ("gauge" | "sum" | "histogram") ' + - 'and metricName (e.g. metricName:"system.cpu.utilization"). Call ' + - 'clickstack_describe_source or clickstack_list_metrics to discover available metric names.', - }, - ], - }; + return mcpUserError( + 'Source kind is "metric", but no select item specifies metricType + metricName. ' + + 'Each select item on a metric source must set metricType ("gauge" | "sum" | "histogram") ' + + 'and metricName (e.g. metricName:"system.cpu.utilization"). Call ' + + 'clickstack_describe_source or clickstack_list_metrics to discover available metric names.', + ); } if (!isMetricSource && metricItemCount > 0) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: - `Source kind is "${source.kind}", not metric — but ${metricItemCount} select item(s) ` + - 'set metricType. metricType + metricName only work on metric sources. ' + - 'Drop the metric fields to query this source, or call clickstack_list_sources to find a ' + - 'source whose kind is "metric".', - }, - ], - }; + return mcpUserError( + `Source kind is "${source.kind}", not metric — but ${metricItemCount} select item(s) ` + + 'set metricType. metricType + metricName only work on metric sources. ' + + 'Drop the metric fields to query this source, or call clickstack_list_sources to find a ' + + 'source whose kind is "metric".', + ); } return null; @@ -392,12 +381,7 @@ export async function runConfigTile( options?: { maxResults?: number; granularity?: string }, ) { if (!isConfigTile(tile)) { - return { - isError: true as const, - content: [ - { type: 'text' as const, text: 'Invalid tile: config field missing' }, - ], - }; + return mcpUserError('Invalid tile: config field missing'); } const internalTile = convertToInternalTileConfig(tile); @@ -422,15 +406,9 @@ export async function runConfigTile( const source = await getSource(teamId, builderConfig.source); if (!source) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Source not found: ${builderConfig.source}. Call clickstack_list_sources to discover available source IDs.`, - }, - ], - }; + return mcpUserError( + `Source not found: ${builderConfig.source}. Call clickstack_list_sources to discover available source IDs.`, + ); } // Reject metric-style select against a non-metric source (and vice @@ -447,15 +425,9 @@ export async function runConfigTile( true, ); if (!connection) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Connection not found for source: ${builderConfig.source}`, - }, - ], - }; + return mcpUserError( + `Connection not found for source: ${builderConfig.source}`, + ); } const clickhouseClient = new ClickhouseClient({ @@ -597,15 +569,9 @@ export async function runConfigTile( true, ); if (!connection) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Connection not found: ${savedConfig.connection}. Call clickstack_list_sources to discover available connection IDs.`, - }, - ], - }; + return mcpUserError( + `Connection not found: ${savedConfig.connection}. Call clickstack_list_sources to discover available connection IDs.`, + ); } const clickhouseClient = new ClickhouseClient({ @@ -637,23 +603,178 @@ export async function runConfigTile( // ─── Error hints ───────────────────────────────────────────────────────────── +/** + * ClickHouse server-side error types that indicate an infrastructure problem + * rather than a user query issue. These appear in `ClickHouseError.type` when + * the ClickHouse server itself reports a connection/network failure (e.g. to + * a replica or Zookeeper). + */ +const SERVER_CH_ERROR_TYPES = new Set([ + 'NETWORK_ERROR', + 'SOCKET_TIMEOUT', + 'POCO_EXCEPTION', + 'ALL_CONNECTION_TRIES_FAILED', +]); + +/** + * Node.js system error codes that indicate a TCP-level connection failure. + * These appear on plain `Error` objects thrown by the ClickHouse HTTP client + * when the server is unreachable — the client never gets an HTTP response, + * so no `ClickHouseError` is constructed. + */ +const SERVER_NODE_ERROR_CODES = new Set([ + 'ECONNREFUSED', + 'ECONNRESET', + 'ENOTFOUND', + 'ETIMEDOUT', + 'ENETUNREACH', + 'EHOSTUNREACH', + 'EPIPE', + 'UND_ERR_CONNECT_TIMEOUT', +]); + +/** + * Check whether an error is a ClickHouseError, using both `instanceof` + * and a constructor-name fallback. The fallback handles the case where + * multiple copies of `@clickhouse/client-common` are installed (e.g. + * the root workspace uses one version while `common-utils` bundles + * another). In that scenario, `instanceof` fails because the class + * identities are different even though the shapes are identical. + */ +function isClickHouseError( + err: unknown, +): err is ClickHouseError & { type: string } { + if (err instanceof ClickHouseError) return true; + if ( + err instanceof Error && + err.constructor?.name === 'ClickHouseError' && + 'type' in err && + typeof (err as Record).type === 'string' + ) { + return true; + } + return false; +} + +/** + * Extract the ClickHouse error `type` from an error, walking `.cause` to + * find the original `ClickHouseError` from `@clickhouse/client-common`. + * + * Uses `instanceof` with a constructor-name fallback to handle duplicate + * package installations across the monorepo (see `isClickHouseError`). + * + * @internal Exported for testing only. + */ +export function getClickHouseErrorType(e: unknown): string | undefined { + if (!(e instanceof Error)) return undefined; + // ClickHouseQueryError wraps the original ClickHouseError as .cause + if (isClickHouseError(e.cause)) { + return e.cause.type; + } + // Direct ClickHouseError (has .type on itself) + if (isClickHouseError(e)) { + return e.type; + } + return undefined; +} + +/** + * Check whether an error represents a system-level infrastructure failure + * rather than a user query issue. Covers both ClickHouse server-side error + * types (e.g. NETWORK_ERROR) and Node.js TCP-level errors (e.g. ECONNREFUSED) + * which the client throws as plain `Error` objects. + * + * @internal Exported for testing only. + */ +export function isServerError(e: unknown): boolean { + // Walk the full cause chain checking for both ClickHouse server-side + // error types and Node.js TCP-level errors at every depth. This + // ensures a ClickHouseError with NETWORK_ERROR nested at arbitrary + // depth is caught, not just at depth 0-1. + let current: unknown = e; + const seen = new Set(); // guard against circular .cause + while (current instanceof Error) { + if (seen.has(current)) break; + seen.add(current); + + // ClickHouse server-side error type at this level + if (isClickHouseError(current)) { + if (SERVER_CH_ERROR_TYPES.has(current.type)) return true; + } + // Also check .cause for a direct ClickHouseError (the common + // ClickHouseQueryError -> ClickHouseError pattern) + if (isClickHouseError(current.cause)) { + if (SERVER_CH_ERROR_TYPES.has(current.cause.type)) return true; + } + + // Node.js TCP/socket-level error + if (hasNodeErrorCode(current, SERVER_NODE_ERROR_CODES)) return true; + // AggregateError.errors may hold the real TCP error + if ( + current instanceof AggregateError && + current.errors.some(inner => + hasNodeErrorCode(inner, SERVER_NODE_ERROR_CODES), + ) + ) { + return true; + } + current = current.cause; + } + + return false; +} + +/** Type-safe check for a Node.js system error code on an unknown value. */ +function hasNodeErrorCode(val: unknown, codes: ReadonlySet): boolean { + if (!(val instanceof Error)) return false; + const code = + 'code' in val && typeof val.code === 'string' ? val.code : undefined; + return code != null && codes.has(code); +} + /** * Decorate raw ClickHouse error messages with actionable guidance before * they reach the agent. Some ClickHouse errors are unhelpful in isolation — * e.g. "Cannot convert string '...Z' to type DateTime64(9)" leaves the agent * guessing about the right format. Catch common patterns and append a hint. + * + * ClickHouse query errors are classified as user errors by default since the + * user/agent wrote the query that failed. Only infrastructure-level errors + * (network, socket, connection failures) are classified as server errors. + * + * @param e The caught error. + * @param context Optional message context. `prefix` is prepended (e.g. + * "Failed to sample rows") and `suffix` is appended (e.g. + * guidance about valid input). The categorization is derived + * from the underlying ClickHouse error type regardless of the + * surrounding context. */ -export function clickHouseErrorResult(e: unknown): { - isError: true; - content: [{ type: 'text'; text: string }]; -} { - const raw = e instanceof Error ? e.message : String(e); +export function clickHouseErrorResult( + e: unknown, + context?: string | { prefix?: string; suffix?: string }, +): McpErrorResult { + const { prefix, suffix } = + typeof context === 'string' + ? { prefix: context, suffix: undefined } + : (context ?? {}); + + // Prefer .message, but fall back to .cause.message when the wrapper + // (e.g. ClickHouseQueryError from common-utils) has an empty message + // and the real error details are in the cause chain. + const raw = + e instanceof Error + ? e.message || + (e.cause instanceof Error ? e.cause.message : '') || + String(e) + : String(e); const hint = errorHint(raw); - const text = hint ? `${raw}\n\nHINT: ${hint}` : raw; - return { - isError: true as const, - content: [{ type: 'text' as const, text }], - }; + const base = hint ? `${raw}\n\nHINT: ${hint}` : raw; + const text = `${prefix ? `${prefix}: ` : ''}${base}${suffix ? ` ${suffix}` : ''}`; + + // Default to a user error — the user wrote the query that failed. + // Only promote to a server error for known infrastructure error types + // (ClickHouse server-side errors or TCP-level connection failures). + return isServerError(e) ? mcpServerError(text) : mcpUserError(text); } /** @internal Exported for testing only. */ diff --git a/packages/api/src/mcp/tools/query/runEventPatterns.ts b/packages/api/src/mcp/tools/query/runEventPatterns.ts index 7fb9a7617d..3f22ee7736 100644 --- a/packages/api/src/mcp/tools/query/runEventPatterns.ts +++ b/packages/api/src/mcp/tools/query/runEventPatterns.ts @@ -10,6 +10,7 @@ import { DisplayType } from '@hyperdx/common-utils/dist/types'; import { getConnectionById } from '@/controllers/connection'; import { getSource } from '@/controllers/sources'; +import { mcpServerError, mcpUserError } from '@/mcp/utils/errors'; import { trimToolResponse } from '@/utils/trimToolResponse'; import { @@ -41,15 +42,9 @@ export async function runEventPatterns( // ── Resolve source & connection ── const source = await getSource(teamId, sourceId); if (!source) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Source not found: ${sourceId}. Call clickstack_list_sources to discover available source IDs.`, - }, - ], - }; + return mcpUserError( + `Source not found: ${sourceId}. Call clickstack_list_sources to discover available source IDs.`, + ); } const connection = await getConnectionById( @@ -58,15 +53,9 @@ export async function runEventPatterns( true, ); if (!connection) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Connection not found for source: ${sourceId}. Call clickstack_list_sources to discover available source IDs.`, - }, - ], - }; + return mcpUserError( + `Connection not found for source: ${sourceId}. Call clickstack_list_sources to discover available source IDs.`, + ); } // ── Determine body column ── @@ -77,36 +66,22 @@ export async function runEventPatterns( if (options?.bodyExpression) { const parts = splitAndTrimWithBracket(options.bodyExpression); if (parts.length !== 1 || !SAFE_BODY_EXPR_CHARS.test(parts[0])) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: - 'bodyExpression must be a single column expression ' + - '(e.g. "Body", "SpanName", "SpanAttributes[\'http.url\']"). ' + - 'Multiple expressions, function calls, or sub-queries are not allowed.', - }, - ], - }; + return mcpUserError( + 'bodyExpression must be a single column expression ' + + '(e.g. "Body", "SpanName", "SpanAttributes[\'http.url\']"). ' + + 'Multiple expressions, function calls, or sub-queries are not allowed.', + ); } bodyColumn = parts[0]; } else { bodyColumn = resolveBodyExpression(source); } if (!bodyColumn) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: - 'Could not determine body column for pattern mining. ' + - 'This source may not have a body/spanName expression configured. ' + - 'Try specifying bodyExpression explicitly.', - }, - ], - }; + return mcpUserError( + 'Could not determine body column for pattern mining. ' + + 'This source may not have a body/spanName expression configured. ' + + 'Try specifying bodyExpression explicitly.', + ); } const clickhouseClient = new ClickhouseClient({ @@ -247,15 +222,7 @@ export async function runEventPatterns( })); } catch (err) { const message = err instanceof Error ? err.message : String(err); - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Pattern mining failed: ${message}`, - }, - ], - }; + return mcpServerError(`Pattern mining failed: ${message}`); } // ── Format response ── diff --git a/packages/api/src/mcp/tools/query/schemas.ts b/packages/api/src/mcp/tools/query/schemas.ts index e000b0626a..ffc39460a9 100644 --- a/packages/api/src/mcp/tools/query/schemas.ts +++ b/packages/api/src/mcp/tools/query/schemas.ts @@ -1,6 +1,8 @@ import { z } from 'zod'; import { QUERYABLE_METRIC_KINDS } from '@/mcp/tools/sources/metricKinds'; +import type { McpErrorResult } from '@/mcp/utils/errors'; +import { mcpUserError } from '@/mcp/utils/errors'; // ─── Shared description fragments ──────────────────────────────────────────── @@ -340,7 +342,7 @@ export function applyMetricSelectDefaults( */ export function validateMetricSelectItems( items: ReadonlyArray, -): { isError: true; content: [{ type: 'text'; text: string }] } | null { +): McpErrorResult | null { const errors: string[] = []; items.forEach((item, idx) => { for (const issue of getMetricSelectIssues(item)) { @@ -348,15 +350,7 @@ export function validateMetricSelectItems( } }); if (errors.length === 0) return null; - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: errors.join('\n'), - }, - ], - }; + return mcpUserError(errors.join('\n')); } export const startTimeSchema = z diff --git a/packages/api/src/mcp/tools/query/search.ts b/packages/api/src/mcp/tools/query/search.ts index f4ca771754..e97ae33b64 100644 --- a/packages/api/src/mcp/tools/query/search.ts +++ b/packages/api/src/mcp/tools/query/search.ts @@ -1,6 +1,7 @@ import { z } from 'zod'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpUserError } from '@/mcp/utils/errors'; import logger from '@/utils/logger'; import { trimToolResponse } from '@/utils/trimToolResponse'; @@ -77,10 +78,7 @@ export function registerSearch({ context, registerTool }: ToolRegistrar) { async input => { const timeRange = parseTimeRange(input.startTime, input.endTime); if ('error' in timeRange) { - return { - isError: true, - content: [{ type: 'text' as const, text: timeRange.error }], - }; + return mcpUserError(timeRange.error); } const { startDate, endDate } = timeRange; diff --git a/packages/api/src/mcp/tools/query/sql.ts b/packages/api/src/mcp/tools/query/sql.ts index ada9f9187d..fa2ee74f32 100644 --- a/packages/api/src/mcp/tools/query/sql.ts +++ b/packages/api/src/mcp/tools/query/sql.ts @@ -1,6 +1,7 @@ import { z } from 'zod'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpUserError } from '@/mcp/utils/errors'; import { buildTile, parseTimeRange, runConfigTile } from './helpers'; import { endTimeSchema, startTimeSchema } from './schemas'; @@ -74,10 +75,7 @@ export function registerSql({ context, registerTool }: ToolRegistrar) { async input => { const timeRange = parseTimeRange(input.startTime, input.endTime); if ('error' in timeRange) { - return { - isError: true, - content: [{ type: 'text' as const, text: timeRange.error }], - }; + return mcpUserError(timeRange.error); } const { startDate, endDate } = timeRange; diff --git a/packages/api/src/mcp/tools/query/table.ts b/packages/api/src/mcp/tools/query/table.ts index cab977e979..8a48dae81e 100644 --- a/packages/api/src/mcp/tools/query/table.ts +++ b/packages/api/src/mcp/tools/query/table.ts @@ -1,6 +1,7 @@ import { z } from 'zod'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpUserError } from '@/mcp/utils/errors'; import { annotateIncreaseTopNHint, @@ -232,10 +233,7 @@ export function registerTable({ context, registerTool }: ToolRegistrar) { async input => { const timeRange = parseTimeRange(input.startTime, input.endTime); if ('error' in timeRange) { - return { - isError: true, - content: [{ type: 'text' as const, text: timeRange.error }], - }; + return mcpUserError(timeRange.error); } const { startDate, endDate } = timeRange; diff --git a/packages/api/src/mcp/tools/query/timeseries.ts b/packages/api/src/mcp/tools/query/timeseries.ts index ebfe23c7b1..181aa901d4 100644 --- a/packages/api/src/mcp/tools/query/timeseries.ts +++ b/packages/api/src/mcp/tools/query/timeseries.ts @@ -2,6 +2,7 @@ import { FIXED_TIME_BUCKET_EXPR_ALIAS } from '@hyperdx/common-utils/dist/core/re import { z } from 'zod'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpUserError } from '@/mcp/utils/errors'; import { annotateIncreaseTopNHint, @@ -103,10 +104,7 @@ export function registerTimeseries({ context, registerTool }: ToolRegistrar) { async input => { const timeRange = parseTimeRange(input.startTime, input.endTime); if ('error' in timeRange) { - return { - isError: true, - content: [{ type: 'text' as const, text: timeRange.error }], - }; + return mcpUserError(timeRange.error); } const { startDate, endDate } = timeRange; diff --git a/packages/api/src/mcp/tools/savedSearches/getSavedSearch.ts b/packages/api/src/mcp/tools/savedSearches/getSavedSearch.ts index dd2b044541..fdadb795d5 100644 --- a/packages/api/src/mcp/tools/savedSearches/getSavedSearch.ts +++ b/packages/api/src/mcp/tools/savedSearches/getSavedSearch.ts @@ -3,7 +3,7 @@ import { z } from 'zod'; import * as config from '@/config'; import { getSavedSearch } from '@/controllers/savedSearch'; import type { ToolRegistrar } from '@/mcp/tools/types'; -import { validateObjectId } from '@/mcp/utils/errors'; +import { mcpUserError, validateObjectId } from '@/mcp/utils/errors'; import { SavedSearch } from '@/models/savedSearch'; export function registerGetSavedSearch({ @@ -57,10 +57,7 @@ export function registerGetSavedSearch({ const savedSearch = await getSavedSearch(teamId, id); if (!savedSearch) { - return { - isError: true, - content: [{ type: 'text' as const, text: 'Saved search not found' }], - }; + return mcpUserError('Saved search not found'); } return { diff --git a/packages/api/src/mcp/tools/savedSearches/saveSavedSearch.ts b/packages/api/src/mcp/tools/savedSearches/saveSavedSearch.ts index 75eadbe686..85f4e25d41 100644 --- a/packages/api/src/mcp/tools/savedSearches/saveSavedSearch.ts +++ b/packages/api/src/mcp/tools/savedSearches/saveSavedSearch.ts @@ -6,7 +6,11 @@ import { } from '@/controllers/savedSearch'; import { getSource } from '@/controllers/sources'; import type { ToolRegistrar } from '@/mcp/tools/types'; -import { mcpError, validateObjectId } from '@/mcp/utils/errors'; +import { + mcpServerError, + mcpUserError, + validateObjectId, +} from '@/mcp/utils/errors'; import { mcpSaveSavedSearchSchema } from './schemas'; @@ -39,7 +43,7 @@ export function registerSaveSavedSearch({ // ── Validate sourceId (format validated by Zod schema, check existence) ── const source = await getSource(teamId, input.sourceId); if (!source) { - return mcpError('Source not found'); + return mcpUserError('Source not found'); } // Build the saved search data matching what the controller expects. @@ -58,12 +62,7 @@ export function registerSaveSavedSearch({ // Verify the saved search exists before updating. const existing = await getSavedSearch(teamId, input.id!); if (!existing) { - return { - isError: true, - content: [ - { type: 'text' as const, text: 'Saved search not found' }, - ], - }; + return mcpUserError('Saved search not found'); } const updated = await updateSavedSearch( @@ -74,15 +73,7 @@ export function registerSaveSavedSearch({ ); if (!updated) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: 'Failed to update saved search', - }, - ], - }; + return mcpServerError('Failed to update saved search'); } return { diff --git a/packages/api/src/mcp/tools/sources/describeMetric.ts b/packages/api/src/mcp/tools/sources/describeMetric.ts index d92ddcbd68..9d63433a81 100644 --- a/packages/api/src/mcp/tools/sources/describeMetric.ts +++ b/packages/api/src/mcp/tools/sources/describeMetric.ts @@ -13,7 +13,9 @@ import { z } from 'zod'; import { getConnectionById } from '@/controllers/connection'; import { getSource } from '@/controllers/sources'; +import { clickHouseErrorResult } from '@/mcp/tools/query/helpers'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpServerError, mcpUserError } from '@/mcp/utils/errors'; import logger from '@/utils/logger'; import { trimToolResponse } from '@/utils/trimToolResponse'; @@ -482,34 +484,19 @@ async function describeMetricImpl( ) { const source = await getSource(teamId, input.sourceId); if (!source) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Source "${input.sourceId}" not found. Call clickstack_list_sources to see available source IDs.`, - }, - ], - }; + return mcpUserError( + `Source "${input.sourceId}" not found. Call clickstack_list_sources to see available source IDs.`, + ); } if (source.kind !== SourceKind.Metric) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Source "${input.sourceId}" is a "${source.kind}" source, not a metric source. clickstack_describe_metric only works on metric sources.`, - }, - ], - }; + return mcpUserError( + `Source "${input.sourceId}" is a "${source.kind}" source, not a metric source. clickstack_describe_metric only works on metric sources.`, + ); } const timeRange = parseTimeRange(input.startTime, input.endTime); if ('error' in timeRange) { - return { - isError: true, - content: [{ type: 'text' as const, text: timeRange.error }], - }; + return mcpUserError(timeRange.error); } const { startDate, endDate } = timeRange; @@ -519,15 +506,7 @@ async function describeMetricImpl( true, ); if (!connection) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Connection not found for source "${input.sourceId}".`, - }, - ], - }; + return mcpUserError(`Connection not found for source "${input.sourceId}".`); } const clickhouseClient = new ClickhouseClient({ @@ -544,15 +523,9 @@ async function describeMetricImpl( const kind = input.kind; const tableName = source.metricTables[kind]; if (!tableName) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Source "${input.sourceId}" has no "${kind}" metric table populated. Populated kinds: ${Object.keys(source.metricTables).join(', ')}.`, - }, - ], - }; + return mcpUserError( + `Source "${input.sourceId}" has no "${kind}" metric table populated. Populated kinds: ${Object.keys(source.metricTables).join(', ')}.`, + ); } // Defensive column-presence check before referencing MetricUnit / @@ -569,15 +542,10 @@ async function describeMetricImpl( { kind, error: e instanceof Error ? e.message : String(e) }, 'describeMetric: getColumns failed', ); - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Failed to load columns for "${tableName}". The metric table may be missing or unreachable.`, - }, - ], - }; + return clickHouseErrorResult(e, { + prefix: `Failed to load columns for "${tableName}"`, + suffix: 'The metric table may be missing or unreachable.', + }); } const columnNames = new Set(columns.map(c => c.name)); const hasUnit = columnNames.has('MetricUnit'); @@ -776,19 +744,12 @@ export function registerDescribeMetric({ { teamId, sourceId: input.sourceId, metricName: input.metricName }, 'clickstack_describe_metric timed out', ); - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: - 'Discovery timed out. The metric table may be under load or the ' + - 'attribute set may be very high-cardinality. Try narrowing ' + - 'startTime/endTime or setting sampleValues:false to skip the ' + - 'value-sampling stage.', - }, - ], - }; + return mcpServerError( + 'Discovery timed out. The metric table may be under load or the ' + + 'attribute set may be very high-cardinality. Try narrowing ' + + 'startTime/endTime or setting sampleValues:false to skip the ' + + 'value-sampling stage.', + ); } throw e; } finally { diff --git a/packages/api/src/mcp/tools/sources/describeSource.ts b/packages/api/src/mcp/tools/sources/describeSource.ts index 24621b58b6..69033303f2 100644 --- a/packages/api/src/mcp/tools/sources/describeSource.ts +++ b/packages/api/src/mcp/tools/sources/describeSource.ts @@ -14,6 +14,7 @@ import { z } from 'zod'; import { getConnectionById } from '@/controllers/connection'; import { getSource } from '@/controllers/sources'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpServerError, mcpUserError } from '@/mcp/utils/errors'; import logger from '@/utils/logger'; import { trimToolResponse } from '@/utils/trimToolResponse'; @@ -239,15 +240,9 @@ async function describeSourceSchema( ) { const source = await getSource(teamId, sourceId); if (!source) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Source "${sourceId}" not found. Call clickstack_list_sources to see available source IDs.`, - }, - ], - }; + return mcpUserError( + `Source "${sourceId}" not found. Call clickstack_list_sources to see available source IDs.`, + ); } const meta: Record = { @@ -341,15 +336,7 @@ async function describeSourceSchema( true, ); if (!connection) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Connection not found for source "${sourceId}".`, - }, - ], - }; + return mcpUserError(`Connection not found for source "${sourceId}".`); } const clickhouseClient = new ClickhouseClient({ @@ -687,17 +674,10 @@ export function registerDescribeSource({ { teamId, sourceId }, 'clickstack_describe_source timed out', ); - return { - isError: true, - content: [ - { - type: 'text' as const, - text: - 'Schema discovery timed out. The ClickHouse server may be under load. ' + - 'Try again, or use clickstack_list_sources for basic source info without schema details.', - }, - ], - }; + return mcpServerError( + 'Schema discovery timed out. The ClickHouse server may be under load. ' + + 'Try again, or use clickstack_list_sources for basic source info without schema details.', + ); } logger.warn( { teamId, sourceId, error: e }, diff --git a/packages/api/src/mcp/tools/sources/listMetrics.ts b/packages/api/src/mcp/tools/sources/listMetrics.ts index 25edd418b3..6b7854d844 100644 --- a/packages/api/src/mcp/tools/sources/listMetrics.ts +++ b/packages/api/src/mcp/tools/sources/listMetrics.ts @@ -11,6 +11,7 @@ import { z } from 'zod'; import { getConnectionById } from '@/controllers/connection'; import { getSource } from '@/controllers/sources'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpServerError, mcpUserError } from '@/mcp/utils/errors'; import logger from '@/utils/logger'; import { @@ -291,17 +292,10 @@ export function registerListMetrics({ { teamId, sourceId: input.sourceId }, 'clickstack_list_metrics timed out', ); - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: - 'clickstack_list_metrics timed out. Try narrowing the time window ' + - '(startTime/endTime), pinning a single `kind`, or adding a namePattern filter.', - }, - ], - }; + return mcpServerError( + 'clickstack_list_metrics timed out. Try narrowing the time window ' + + '(startTime/endTime), pinning a single `kind`, or adding a namePattern filter.', + ); } throw e; } finally { @@ -318,34 +312,19 @@ async function listMetricsImpl( ) { const source = await getSource(teamId, input.sourceId); if (!source) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Source "${input.sourceId}" not found. Call clickstack_list_sources to see available source IDs.`, - }, - ], - }; + return mcpUserError( + `Source "${input.sourceId}" not found. Call clickstack_list_sources to see available source IDs.`, + ); } if (source.kind !== SourceKind.Metric) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Source "${input.sourceId}" is a "${source.kind}" source, not a metric source. clickstack_list_metrics only works on metric sources — call clickstack_list_sources to find one whose kind is "metric".`, - }, - ], - }; + return mcpUserError( + `Source "${input.sourceId}" is a "${source.kind}" source, not a metric source. clickstack_list_metrics only works on metric sources — call clickstack_list_sources to find one whose kind is "metric".`, + ); } const timeRange = parseTimeRange(input.startTime, input.endTime); if ('error' in timeRange) { - return { - isError: true, - content: [{ type: 'text' as const, text: timeRange.error }], - }; + return mcpUserError(timeRange.error); } const { startDate, endDate } = timeRange; @@ -353,15 +332,9 @@ async function listMetricsImpl( // truncated or tampered cursor does not surface internals. const cursor = input.cursor ? decodeCursor(input.cursor) : null; if (input.cursor && !cursor) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: 'Invalid cursor. Omit cursor to start over, or pass the exact `nextCursor` value returned by a previous call.', - }, - ], - }; + return mcpUserError( + 'Invalid cursor. Omit cursor to start over, or pass the exact `nextCursor` value returned by a previous call.', + ); } // Resolve which kinds to scan, in order. When a cursor is set, @@ -374,15 +347,9 @@ async function listMetricsImpl( if (startKindIdx < 0) { // Cursor points at a kind that's not in scope for this call — // safer to error than silently skip. - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Cursor references kind "${cursor!.kind}" but that kind is not in scope for this call. Drop the kind filter or pass a matching cursor.`, - }, - ], - }; + return mcpUserError( + `Cursor references kind "${cursor!.kind}" but that kind is not in scope for this call. Drop the kind filter or pass a matching cursor.`, + ); } const connection = await getConnectionById( @@ -391,15 +358,7 @@ async function listMetricsImpl( true, ); if (!connection) { - return { - isError: true, - content: [ - { - type: 'text' as const, - text: `Connection not found for source "${input.sourceId}".`, - }, - ], - }; + return mcpUserError(`Connection not found for source "${input.sourceId}".`); } const clickhouseClient = new ClickhouseClient({ diff --git a/packages/api/src/mcp/tools/trace/breakdown.ts b/packages/api/src/mcp/tools/trace/breakdown.ts index 20fca3787c..ac902b7b3d 100644 --- a/packages/api/src/mcp/tools/trace/breakdown.ts +++ b/packages/api/src/mcp/tools/trace/breakdown.ts @@ -21,8 +21,12 @@ import { z } from 'zod'; import { getConnectionById } from '@/controllers/connection'; import { getSource } from '@/controllers/sources'; -import { parseTimeRange } from '@/mcp/tools/query/helpers'; +import { + clickHouseErrorResult, + parseTimeRange, +} from '@/mcp/tools/query/helpers'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpUserError } from '@/mcp/utils/errors'; // ─── Schema ────────────────────────────────────────────────────────────────── @@ -157,35 +161,20 @@ export function registerTraceBreakdown({ const timeRange = parseTimeRange(input.startTime, input.endTime); if ('error' in timeRange) { - return { - isError: true as const, - content: [{ type: 'text' as const, text: timeRange.error }], - }; + return mcpUserError(timeRange.error); } const { startDate, endDate } = timeRange; const source = await getSource(teamId.toString(), input.sourceId); if (!source) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Source not found: ${input.sourceId}. Call clickstack_list_sources to find available source IDs.`, - }, - ], - }; + return mcpUserError( + `Source not found: ${input.sourceId}. Call clickstack_list_sources to find available source IDs.`, + ); } if (source.kind !== SourceKind.Trace) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Source ${input.sourceId} is kind="${source.kind}". clickstack_trace_top_time_consuming_operations requires a source of kind="trace".`, - }, - ], - }; + return mcpUserError( + `Source ${input.sourceId} is kind="${source.kind}". clickstack_trace_top_time_consuming_operations requires a source of kind="trace".`, + ); } const connection = await getConnectionById( @@ -194,15 +183,9 @@ export function registerTraceBreakdown({ true, ); if (!connection) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Connection not found for source: ${input.sourceId}`, - }, - ], - }; + return mcpUserError( + `Connection not found for source: ${input.sourceId}`, + ); } // Source-configured SQL expressions. These are trusted (set by the @@ -310,15 +293,11 @@ LIMIT {topN:UInt32} ).json()) ?? { data: [] }; rows = json.data ?? []; } catch (e) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Failed to compute breakdown: ${e instanceof Error ? e.message : String(e)}. The parentFilter must be valid ClickHouse SQL referencing columns on the trace table (e.g. ServiceName = 'X' AND SpanName = 'Y').`, - }, - ], - }; + return clickHouseErrorResult(e, { + prefix: 'Failed to compute breakdown', + suffix: + "The parentFilter must be valid ClickHouse SQL referencing columns on the trace table (e.g. ServiceName = 'X' AND SpanName = 'Y').", + }); } // Normalise numerics — ClickHouse JSON sometimes returns strings for diff --git a/packages/api/src/mcp/tools/trace/waterfall.ts b/packages/api/src/mcp/tools/trace/waterfall.ts index 57611852a1..dc11af473f 100644 --- a/packages/api/src/mcp/tools/trace/waterfall.ts +++ b/packages/api/src/mcp/tools/trace/waterfall.ts @@ -10,8 +10,12 @@ import { z } from 'zod'; import { getConnectionById } from '@/controllers/connection'; import { getSource } from '@/controllers/sources'; -import { parseTimeRange } from '@/mcp/tools/query/helpers'; +import { + clickHouseErrorResult, + parseTimeRange, +} from '@/mcp/tools/query/helpers'; import type { ToolRegistrar } from '@/mcp/tools/types'; +import { mcpUserError } from '@/mcp/utils/errors'; // ─── Schema ────────────────────────────────────────────────────────────────── @@ -210,35 +214,20 @@ export function registerTraceWaterfall({ const input = traceSchema.parse(rawInput); const timeRange = parseTimeRange(input.startTime, input.endTime); if ('error' in timeRange) { - return { - isError: true as const, - content: [{ type: 'text' as const, text: timeRange.error }], - }; + return mcpUserError(timeRange.error); } const { startDate, endDate } = timeRange; const source = await getSource(teamId.toString(), input.sourceId); if (!source) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Source not found: ${input.sourceId}. Call clickstack_list_sources to find available source IDs.`, - }, - ], - }; + return mcpUserError( + `Source not found: ${input.sourceId}. Call clickstack_list_sources to find available source IDs.`, + ); } if (source.kind !== SourceKind.Trace) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Source ${input.sourceId} is kind="${source.kind}". clickstack_trace_waterfall requires a source of kind="trace".`, - }, - ], - }; + return mcpUserError( + `Source ${input.sourceId} is kind="${source.kind}". clickstack_trace_waterfall requires a source of kind="trace".`, + ); } const connection = await getConnectionById( @@ -247,15 +236,9 @@ export function registerTraceWaterfall({ true, ); if (!connection) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Connection not found for source: ${input.sourceId}`, - }, - ], - }; + return mcpUserError( + `Connection not found for source: ${input.sourceId}`, + ); } const clickhouseClient = new ClickhouseClient({ @@ -341,15 +324,7 @@ export function registerTraceWaterfall({ querySettings: source.querySettings, })) as { data?: Array> }; } catch (e) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Failed to pick a trace: ${e instanceof Error ? e.message : String(e)}`, - }, - ], - }; + return clickHouseErrorResult(e, 'Failed to pick a trace'); } const firstRow = pickResult.data?.[0]; @@ -380,15 +355,9 @@ export function registerTraceWaterfall({ ([k]) => k !== 'span_count' && k !== '__hdx_time_bucket', ); if (!candidate || candidate[1] == null) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Picker returned a row but no TraceId column was found. Row keys: ${Object.keys(firstRow).join(', ')}`, - }, - ], - }; + return mcpUserError( + `Picker returned a row but no TraceId column was found. Row keys: ${Object.keys(firstRow).join(', ')}`, + ); } pickedTraceId = String(candidate[1]); } @@ -438,15 +407,10 @@ export function registerTraceWaterfall({ rows = (await (result as { json: () => Promise }).json()) ?? []; } catch (e) { - return { - isError: true as const, - content: [ - { - type: 'text' as const, - text: `Failed to fetch trace ${pickedTraceId}: ${e instanceof Error ? e.message : String(e)}`, - }, - ], - }; + return clickHouseErrorResult( + e, + `Failed to fetch trace ${pickedTraceId}`, + ); } const truncated = rows.length > input.maxSpans; diff --git a/packages/api/src/mcp/utils/errors.ts b/packages/api/src/mcp/utils/errors.ts index 7c22a217b0..31c85cf996 100644 --- a/packages/api/src/mcp/utils/errors.ts +++ b/packages/api/src/mcp/utils/errors.ts @@ -1,18 +1,63 @@ import mongoose from 'mongoose'; -type McpErrorResult = { +/** + * Error category for MCP tool failures. + * + * - `user` — the agent/user caused the error (bad input, not-found, + * bad query syntax, wrong source kind, etc.). Not alertable. + * - `server` — unexpected system failure (uncaught exception, database + * outage, timeout on a system-controlled query, etc.). + * Alertable. + */ +export type McpErrorCategory = 'user' | 'server'; + +export type McpErrorResult = { isError: true; content: [{ type: 'text'; text: string }]; }; /** - * Build a standard MCP error response. + * Side-channel for error categories. Using a WeakMap instead of an object + * property ensures `_errorCategory` can never leak through the MCP SDK's + * `z.looseObject()` passthrough serialization, even if a result bypasses + * `withToolTracing`. */ -export function mcpError(text: string): McpErrorResult { - return { +const errorCategoryMap = new WeakMap(); + +/** Retrieve the error category for a result, if one was set. */ +export function getErrorCategory( + result: McpErrorResult, +): McpErrorCategory | undefined { + return errorCategoryMap.get(result); +} + +function buildMcpError( + text: string, + category: McpErrorCategory, +): McpErrorResult { + const result: McpErrorResult = { isError: true as const, content: [{ type: 'text' as const, text }], }; + errorCategoryMap.set(result, category); + return result; +} + +/** + * Build an MCP error response for an agent/user-caused error (bad input, + * not-found, bad query syntax, etc.). Not alertable. + */ +export function mcpUserError(text: string): McpErrorResult { + return buildMcpError(text, 'user'); +} + +/** + * Build an MCP error response for an unexpected system failure (uncaught + * exception, database outage, timeout on a system-controlled query, etc.). + * Alertable. + */ +export function mcpServerError(text: string): McpErrorResult { + return buildMcpError(text, 'server'); } /** @@ -24,7 +69,7 @@ export function validateObjectId( label: string, ): McpErrorResult | null { if (!mongoose.Types.ObjectId.isValid(id)) { - return mcpError(`Invalid ${label}`); + return mcpUserError(`Invalid ${label}`); } return null; } diff --git a/packages/api/src/mcp/utils/tracing.ts b/packages/api/src/mcp/utils/tracing.ts index d21fe1cb64..56b1971086 100644 --- a/packages/api/src/mcp/utils/tracing.ts +++ b/packages/api/src/mcp/utils/tracing.ts @@ -1,6 +1,8 @@ import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js'; import type { McpContext, ToolResult } from '@/mcp/tools/types'; +import type { McpErrorCategory, McpErrorResult } from '@/mcp/utils/errors'; +import { getErrorCategory } from '@/mcp/utils/errors'; import { getCounter, getHistogram, @@ -54,11 +56,20 @@ export function withToolTracing( const durationMs = Date.now() - startTime; if (result.isError) { + // Default to 'server' when category is not set — safe default + // that surfaces un-classified errors in alerts. + const errorCategory: McpErrorCategory = + getErrorCategory(result as McpErrorResult) ?? 'server'; + span.setStatus({ code: SpanStatusCode.ERROR }); span.setAttribute('mcp.tool.error', true); - toolErrorCounter.add(1, { tool: toolName }); + span.setAttribute('mcp.tool.error_category', errorCategory); + toolErrorCounter.add(1, { + tool: toolName, + error_category: errorCategory, + }); logger.warn( - { ...logContext, durationMs }, + { ...logContext, durationMs, errorCategory }, `MCP tool error: ${toolName}`, ); } else { @@ -75,8 +86,12 @@ export function withToolTracing( } catch (err) { const durationMs = Date.now() - startTime; span.setAttribute('mcp.tool.duration_ms', durationMs); + span.setAttribute('mcp.tool.error_category', 'server'); toolDurationHistogram.record(durationMs, { tool: toolName }); - toolErrorCounter.add(1, { tool: toolName }); + toolErrorCounter.add(1, { + tool: toolName, + error_category: 'server', + }); logger.error( { ...logContext, durationMs, error: err }, diff --git a/yarn.lock b/yarn.lock index 2ded158aa2..af1744fdf2 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4442,6 +4442,7 @@ __metadata: "@ai-sdk/anthropic": "npm:^3.0.58" "@ai-sdk/openai": "npm:^3.0.47" "@braintree/sanitize-url": "npm:^7.1.1" + "@clickhouse/client-common": "npm:1.23.0-head.fae5998.1" "@esm2cjs/p-queue": "npm:^7.3.0" "@hyperdx/common-utils": "npm:^0.21.0" "@hyperdx/node-opentelemetry": "npm:^0.9.0"