diff --git a/.changeset/exemplar-mode-metrics.md b/.changeset/exemplar-mode-metrics.md new file mode 100644 index 0000000000..dda84d862c --- /dev/null +++ b/.changeset/exemplar-mode-metrics.md @@ -0,0 +1,22 @@ +--- +"@hyperdx/common-utils": minor +"@hyperdx/api": minor +"@hyperdx/app": minor +--- + +feat: add exemplar overlay for metric and PromQL charts + +Time charts on metric and PromQL sources can now overlay exemplars — +individual data points linked to a trace — via the "Exemplars" toggle in the +chart editor (next to "As Ratio" for metric charts, in the PromQL editor for +PromQL charts). Markers snap onto the series line so the chart stays honest; +hovering a marker shows trace metadata (service, span, duration, status) from a +configurable exemplar trace source, with a button to open the trace directly. + +For structured metric sources, exemplars are read directly from the OTel metric +tables' `Exemplars.*` columns (`renderMetricExemplarsChartConfig`), honoring the +chart's time range, metric name, and filters. For PromQL sources backed by a +real Prometheus endpoint, the new `/v1/prometheus/query_exemplars` route proxies +to Prometheus's native `/api/v1/query_exemplars`. The overlay is opt-in and runs +its query in parallel only when enabled, so charts that don't use it are +unaffected. Trace-source exemplar generation lands in a follow-up. diff --git a/.changeset/span-metrics-connector.md b/.changeset/span-metrics-connector.md new file mode 100644 index 0000000000..002e942870 --- /dev/null +++ b/.changeset/span-metrics-connector.md @@ -0,0 +1,15 @@ +--- +"@hyperdx/api": minor +"@hyperdx/otel-collector": minor +--- + +feat: optional spanmetrics connector for metric exemplars + +Adds the `spanmetricsconnector` to the collector build and wires it into the +OpAMP-generated collector config, gated on the `ENABLE_SPAN_METRICS` env flag +(off by default). When enabled, the collector derives `traces.span.metrics.*` +(calls + duration histogram) from spans with **exemplars enabled**, so the +duration histogram lands in ClickHouse with `Exemplars.*` pointing back at the +spans they were measured from — giving coherent, fully-OTLP metric exemplars +without any direct ClickHouse writes. Enabled in local dev to back the new +`telemetry-generator` service. diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 95e882eca0..7ec6efc04a 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -41,6 +41,9 @@ services: OTEL_SUPERVISOR_LOGS: 'true' HYPERDX_OTEL_EXPORTER_TABLES_TTL: '24h' ENABLE_PROMQL: 'true' + # Remote-write the span-derived metrics (with exemplars) to the dev + # Prometheus so the native query_exemplars path is testable with real data. + SPAN_METRICS_PROM_RW_ENDPOINT: 'http://prometheus:9090/api/v1/write' volumes: - ./docker/otel-collector/config.yaml:/etc/otelcol-contrib/config.yaml - ./docker/otel-collector/supervisor_docker.yaml.tmpl:/etc/otel/supervisor.yaml.tmpl @@ -157,9 +160,15 @@ services: hdx.dev.service: prometheus hdx.dev.port: '${HDX_DEV_PROMETHEUS_PORT:-9090}' hdx.dev.url: 'http://localhost:${HDX_DEV_PROMETHEUS_PORT:-9090}' - profiles: - - prometheus image: prom/prometheus:latest + # exemplar-storage: query_exemplars data; remote-write receiver: ingest the + # collector's span-derived metrics (with exemplars) from real generated data. + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--enable-feature=exemplar-storage' + - '--web.enable-remote-write-receiver' + - '--web.enable-lifecycle' ports: - '${HDX_DEV_PROMETHEUS_PORT:-9090}:9090' volumes: @@ -169,5 +178,25 @@ services: - internal restart: on-failure + # Synthetic traces (via OTLP) + coherent metric exemplars (to ClickHouse) for + # local dev + e2e. See telemetry-generator/README.md. + telemetry-generator: + labels: + <<: *hdx-labels + hdx.dev.service: telemetry-generator + build: + context: ./telemetry-generator + environment: + OTEL_EXPORTER_OTLP_ENDPOINT: 'http://otel-collector:4318' + GEN_OTLP_API_KEY: '${INGESTION_API_KEY:-super-secure-ingestion-api-key}' + GEN_BACKFILL_MINUTES: '30' + GEN_RATE_PER_SEC: '20' + networks: + - internal + restart: on-failure + depends_on: + otel-collector: + condition: service_started + networks: internal: diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index 94877ac65d..ff6643b21f 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -18,3 +18,7 @@ scrape_configs: static_configs: - targets: ['ch-server:9363'] metrics_path: '/metrics' + + # Note: the span-derived request metrics (with exemplars) arrive via + # remote-write from the OTel collector, not a scrape — see the collector's + # SPAN_METRICS_PROM_RW_ENDPOINT and the spanmetrics connector. diff --git a/knip.json b/knip.json index 02a9ae465a..d07bd933f4 100644 --- a/knip.json +++ b/knip.json @@ -36,7 +36,8 @@ "ignore": [ "scripts/dev-portal/**", ".github/scripts/**", - "docker/hyperdx/**" + "docker/hyperdx/**", + "telemetry-generator/**" ], "ignoreBinaries": ["make", "migrate"], "ignoreDependencies": [ diff --git a/packages/api/.env.development b/packages/api/.env.development index 8ba0a6c0c8..a8ec2bd9ca 100644 --- a/packages/api/.env.development +++ b/packages/api/.env.development @@ -20,9 +20,11 @@ REDIS_URL=redis://localhost:6379 USAGE_STATS_ENABLED=false NODE_OPTIONS="--max-http-header-size=131072" ENABLE_SWAGGER=true -DEFAULT_CONNECTIONS=[{"name":"Local ClickHouse","host":"http://localhost:${HDX_DEV_CH_HTTP_PORT}","username":"default","password":""}] -DEFAULT_SOURCES=[{"from":{"databaseName":"default","tableName":"otel_logs"},"kind":"log","timestampValueExpression":"Timestamp","name":"Logs","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","metadataMaterializedViews":{"keyRollupTable":"otel_logs_key_rollup_15m","kvRollupTable":"otel_logs_kv_rollup_15m","granularity":"15 minute"},"connection":"Local ClickHouse","traceSourceId":"Traces","sessionSourceId":"Sessions","metricSourceId":"Metrics"},{"from":{"databaseName":"default","tableName":"otel_traces"},"kind":"trace","timestampValueExpression":"Timestamp","name":"Traces","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"SpanName","serviceNameExpression":"ServiceName","eventAttributesExpression":"SpanAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName","traceIdExpression":"TraceId","spanIdExpression":"SpanId","durationExpression":"Duration","durationPrecision":9,"parentSpanIdExpression":"ParentSpanId","spanNameExpression":"SpanName","spanKindExpression":"SpanKind","statusCodeExpression":"StatusCode","statusMessageExpression":"StatusMessage","metadataMaterializedViews":{"keyRollupTable":"otel_traces_key_rollup_15m","kvRollupTable":"otel_traces_kv_rollup_15m","granularity":"15 minute"},"connection":"Local ClickHouse","logSourceId":"Logs","sessionSourceId":"Sessions","metricSourceId":"Metrics"},{"from":{"databaseName":"default","tableName":""},"kind":"metric","timestampValueExpression":"TimeUnix","name":"Metrics","resourceAttributesExpression":"ResourceAttributes","metricTables":{"gauge":"otel_metrics_gauge","histogram":"otel_metrics_histogram","sum":"otel_metrics_sum","_id":"682586a8b1f81924e628e808","id":"682586a8b1f81924e628e808"},"connection":"Local ClickHouse","logSourceId":"Logs","traceSourceId":"Traces","sessionSourceId":"Sessions"},{"from":{"databaseName":"default","tableName":"hyperdx_sessions"},"kind":"session","timestampValueExpression":"TimestampTime","name":"Sessions","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","connection":"Local ClickHouse","logSourceId":"Logs","traceSourceId":"Traces","metricSourceId":"Metrics"},{"from":{"databaseName":"otel_json","tableName":"otel_logs"},"kind":"log","timestampValueExpression":"Timestamp","name":"JSON Logs","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","connection":"Local ClickHouse","traceSourceId":"JSON Traces","metricSourceId":"JSON Metrics"},{"from":{"databaseName":"otel_json","tableName":"otel_traces"},"kind":"trace","timestampValueExpression":"Timestamp","name":"JSON Traces","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"SpanName","serviceNameExpression":"ServiceName","eventAttributesExpression":"SpanAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName","traceIdExpression":"TraceId","spanIdExpression":"SpanId","durationExpression":"Duration","durationPrecision":9,"parentSpanIdExpression":"ParentSpanId","spanNameExpression":"SpanName","spanKindExpression":"SpanKind","statusCodeExpression":"StatusCode","statusMessageExpression":"StatusMessage","connection":"Local ClickHouse","logSourceId":"JSON Logs","metricSourceId":"JSON Metrics"},{"from":{"databaseName":"otel_json","tableName":""},"kind":"metric","timestampValueExpression":"TimeUnix","name":"JSON Metrics","resourceAttributesExpression":"ResourceAttributes","metricTables":{"gauge":"otel_metrics_gauge","histogram":"otel_metrics_histogram","sum":"otel_metrics_sum"},"connection":"Local ClickHouse","logSourceId":"JSON Logs","traceSourceId":"JSON Traces"}] +DEFAULT_CONNECTIONS=[{"name":"Local ClickHouse","host":"http://localhost:${HDX_DEV_CH_HTTP_PORT}","username":"default","password":""},{"name":"Local Prometheus","host":"http://localhost:${HDX_DEV_PROMETHEUS_PORT}","username":"","password":"","isPrometheusEndpoint":true}] +DEFAULT_SOURCES=[{"from":{"databaseName":"default","tableName":"otel_logs"},"kind":"log","timestampValueExpression":"Timestamp","name":"Logs","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","metadataMaterializedViews":{"keyRollupTable":"otel_logs_key_rollup_15m","kvRollupTable":"otel_logs_kv_rollup_15m","granularity":"15 minute"},"connection":"Local ClickHouse","traceSourceId":"Traces","sessionSourceId":"Sessions","metricSourceId":"Metrics"},{"from":{"databaseName":"default","tableName":"otel_traces"},"kind":"trace","timestampValueExpression":"Timestamp","name":"Traces","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"SpanName","serviceNameExpression":"ServiceName","eventAttributesExpression":"SpanAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName","traceIdExpression":"TraceId","spanIdExpression":"SpanId","durationExpression":"Duration","durationPrecision":9,"parentSpanIdExpression":"ParentSpanId","spanNameExpression":"SpanName","spanKindExpression":"SpanKind","statusCodeExpression":"StatusCode","statusMessageExpression":"StatusMessage","metadataMaterializedViews":{"keyRollupTable":"otel_traces_key_rollup_15m","kvRollupTable":"otel_traces_kv_rollup_15m","granularity":"15 minute"},"connection":"Local ClickHouse","logSourceId":"Logs","sessionSourceId":"Sessions","metricSourceId":"Metrics"},{"from":{"databaseName":"default","tableName":""},"kind":"metric","timestampValueExpression":"TimeUnix","name":"Metrics","resourceAttributesExpression":"ResourceAttributes","metricTables":{"gauge":"otel_metrics_gauge","histogram":"otel_metrics_histogram","sum":"otel_metrics_sum","_id":"682586a8b1f81924e628e808","id":"682586a8b1f81924e628e808"},"connection":"Local ClickHouse","logSourceId":"Logs","traceSourceId":"Traces","sessionSourceId":"Sessions"},{"from":{"databaseName":"default","tableName":"hyperdx_sessions"},"kind":"session","timestampValueExpression":"TimestampTime","name":"Sessions","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","connection":"Local ClickHouse","logSourceId":"Logs","traceSourceId":"Traces","metricSourceId":"Metrics"},{"from":{"databaseName":"otel_json","tableName":"otel_logs"},"kind":"log","timestampValueExpression":"Timestamp","name":"JSON Logs","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","connection":"Local ClickHouse","traceSourceId":"JSON Traces","metricSourceId":"JSON Metrics"},{"from":{"databaseName":"otel_json","tableName":"otel_traces"},"kind":"trace","timestampValueExpression":"Timestamp","name":"JSON Traces","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"SpanName","serviceNameExpression":"ServiceName","eventAttributesExpression":"SpanAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName","traceIdExpression":"TraceId","spanIdExpression":"SpanId","durationExpression":"Duration","durationPrecision":9,"parentSpanIdExpression":"ParentSpanId","spanNameExpression":"SpanName","spanKindExpression":"SpanKind","statusCodeExpression":"StatusCode","statusMessageExpression":"StatusMessage","connection":"Local ClickHouse","logSourceId":"JSON Logs","metricSourceId":"JSON Metrics"},{"from":{"databaseName":"otel_json","tableName":""},"kind":"metric","timestampValueExpression":"TimeUnix","name":"JSON Metrics","resourceAttributesExpression":"ResourceAttributes","metricTables":{"gauge":"otel_metrics_gauge","histogram":"otel_metrics_histogram","sum":"otel_metrics_sum"},"connection":"Local ClickHouse","logSourceId":"JSON Logs","traceSourceId":"JSON Traces"},{"from":{"databaseName":"prometheus","tableName":"prometheus"},"kind":"promql","timestampValueExpression":"timestamp","name":"Prometheus","connection":"Local Prometheus","traceSourceId":"Traces"}] INGESTION_API_KEY="super-secure-ingestion-api-key" HYPERDX_API_KEY=$INGESTION_API_KEY ANTHROPIC_API_KEY="your-anthropic-api-key-here" ENABLE_PROMQL=true +ENABLE_SPAN_METRICS=true +ENABLE_SPAN_METRICS_PROM_RW=true diff --git a/packages/api/src/config.ts b/packages/api/src/config.ts index 6677989124..20bfff0b0c 100644 --- a/packages/api/src/config.ts +++ b/packages/api/src/config.ts @@ -51,6 +51,21 @@ export const DEFAULT_SOURCES = env.DEFAULT_SOURCES; export const IS_PROMQL_ENABLED = env.ENABLE_PROMQL === 'true'; +// Opt-in: have the collector derive request metrics (with trace exemplars) from +// spans via the spanmetrics connector. Off by default; enabled in dev so the +// telemetry-generator's traces produce coherent metric exemplars end-to-end. +export const IS_SPAN_METRICS_ENABLED = env.ENABLE_SPAN_METRICS === 'true'; + +// Opt-in: also remote-write the span-derived metrics (with exemplars) to a +// Prometheus endpoint so the native Prometheus query_exemplars path can be +// tested against real data. The endpoint is resolved here (API side) and +// inlined into the generated collector config, so the collector container does +// not need SPAN_METRICS_PROM_RW_ENDPOINT in its own environment. Requires the +// endpoint to be set; without it the feature stays disabled. +export const SPAN_METRICS_PROM_RW_ENDPOINT = env.SPAN_METRICS_PROM_RW_ENDPOINT; +export const IS_SPAN_METRICS_PROM_RW_ENABLED = + env.ENABLE_SPAN_METRICS_PROM_RW === 'true' && !!SPAN_METRICS_PROM_RW_ENDPOINT; + // FOR CI ONLY export const CLICKHOUSE_HOST = env.CLICKHOUSE_HOST as string; export const CLICKHOUSE_USER = env.CLICKHOUSE_USER as string; diff --git a/packages/api/src/models/team.ts b/packages/api/src/models/team.ts index 4d6b620162..259c695f2c 100644 --- a/packages/api/src/models/team.ts +++ b/packages/api/src/models/team.ts @@ -42,6 +42,7 @@ export default mongoose.model( fieldMetadataDisabled: Boolean, parallelizeWhenPossible: Boolean, filterKeysFetchLimit: Number, + maxExemplars: Number, }, { timestamps: true, diff --git a/packages/api/src/opamp/controllers/opampController.ts b/packages/api/src/opamp/controllers/opampController.ts index 5d726622a1..2774b385a4 100644 --- a/packages/api/src/opamp/controllers/opampController.ts +++ b/packages/api/src/opamp/controllers/opampController.ts @@ -82,6 +82,13 @@ type CollectorConfig = { pipelines: string[]; }>; }; + span_metrics?: { + histogram: { unit: string; explicit: { buckets: string[] } }; + dimensions: Array<{ name: string }>; + exemplars: { enabled: boolean }; + metrics_flush_interval: string; + namespace?: string; + }; }; exporters?: { nop?: null; @@ -132,6 +139,15 @@ type CollectorConfig = { enabled: boolean; }; }; + 'prometheusremotewrite/spanmetrics'?: { + endpoint: string; + tls: { + insecure: boolean; + }; + resource_to_telemetry_conversion: { + enabled: boolean; + }; + }; }; service: { extensions: string[]; @@ -322,6 +338,69 @@ export const buildOtelCollectorConfig = ( }; } + if ( + config.IS_SPAN_METRICS_ENABLED && + otelCollectorConfig.connectors && + otelCollectorConfig.exporters + ) { + // Derive request metrics (with trace exemplars) from spans. The connector + // consumes the traces pipeline and feeds a dedicated metrics pipeline, so + // the resulting `traces.span.metrics.*` land in ClickHouse with + // `Exemplars.*` pointing back at the spans they were measured from. + otelCollectorConfig.connectors.span_metrics = { + histogram: { + unit: 'ms', + explicit: { + buckets: [ + '2ms', + '5ms', + '10ms', + '25ms', + '50ms', + '100ms', + '250ms', + '500ms', + '1s', + '2.5s', + '5s', + '10s', + ], + }, + }, + dimensions: [ + { name: 'http.route' }, + { name: 'http.method' }, + { name: 'host.region' }, + { name: 'app.tenant_id' }, + { name: 'http.status_code' }, + ], + exemplars: { enabled: true }, + metrics_flush_interval: '15s', + }; + otelCollectorConfig.service.pipelines.traces.exporters.push( + 'span_metrics', + ); + + const spanMetricsExporters = ['clickhouse']; + // Optionally also remote-write the derived metrics (with exemplars) to a + // Prometheus endpoint, so the native Prometheus `query_exemplars` path can + // be exercised against the same real, generated data. + if (config.IS_SPAN_METRICS_PROM_RW_ENABLED) { + otelCollectorConfig.exporters['prometheusremotewrite/spanmetrics'] = { + // Guaranteed set by IS_SPAN_METRICS_PROM_RW_ENABLED above. + endpoint: config.SPAN_METRICS_PROM_RW_ENDPOINT!, + tls: { insecure: true }, + resource_to_telemetry_conversion: { enabled: true }, + }; + spanMetricsExporters.push('prometheusremotewrite/spanmetrics'); + } + otelCollectorConfig.service.pipelines['metrics/spanmetrics'] = { + receivers: ['span_metrics'], + processors: ['memory_limiter', 'batch'], + exporters: spanMetricsExporters, + }; + } + if (collectorAuthenticationEnforced) { if (otelCollectorConfig.receivers['otlp/hyperdx'] == null) { // should never happen diff --git a/packages/api/src/routers/api/__tests__/prometheus.integration.test.ts b/packages/api/src/routers/api/__tests__/prometheus.integration.test.ts index dc48dd8499..dcfee80a83 100644 --- a/packages/api/src/routers/api/__tests__/prometheus.integration.test.ts +++ b/packages/api/src/routers/api/__tests__/prometheus.integration.test.ts @@ -270,4 +270,62 @@ describe('prometheus router', () => { expect(calledUrl).toContain('/api/v1/label/__name__/values'); }); }); + + describe('GET /v1/prometheus/query_exemplars', () => { + it('returns 400 when query parameter is missing', async () => { + const { agent } = await getLoggedInAgent(server); + const res = await agent + .get('/v1/prometheus/query_exemplars') + .query({ connectionId: new Types.ObjectId().toString() }) + .expect(400); + expect(res.body).toMatchObject({ + status: 'error', + errorType: 'bad_data', + error: expect.stringContaining('query'), + }); + }); + + it('proxies to upstream Prometheus when connection isPrometheusEndpoint', async () => { + const { agent, team } = await getLoggedInAgent(server); + const conn = await seedPrometheusConnection(team._id); + + const promResponse = { status: 'success', data: [] }; + mockFetch.mockResolvedValueOnce( + fakeUpstreamResponse(promResponse) as any, + ); + + const res = await agent + .get('/v1/prometheus/query_exemplars') + .query({ + query: 'up', + start: '1700000000', + end: '1700000060', + connectionId: conn._id.toString(), + }) + .expect(200); + + expect(res.body).toEqual(promResponse); + const calledUrl = mockFetch.mock.calls[0][0] as string; + expect(calledUrl).toContain('/api/v1/query_exemplars'); + expect(calledUrl).toContain('query=up'); + }); + + it('returns an empty result for ClickHouse-backed connections (no native exemplar table function)', async () => { + const { agent, team } = await getLoggedInAgent(server); + const conn = await seedClickHouseConnection(team._id); + + const res = await agent + .get('/v1/prometheus/query_exemplars') + .query({ + query: 'up', + start: '1700000000', + end: '1700000060', + connectionId: conn._id.toString(), + }) + .expect(200); + + expect(res.body).toEqual({ status: 'success', data: [] }); + expect(mockFetch).not.toHaveBeenCalled(); + }); + }); }); diff --git a/packages/api/src/routers/api/prometheus.ts b/packages/api/src/routers/api/prometheus.ts index 07f446816f..ad35357206 100644 --- a/packages/api/src/routers/api/prometheus.ts +++ b/packages/api/src/routers/api/prometheus.ts @@ -478,6 +478,86 @@ const queryHandler: express.RequestHandler = async (req, res) => { router.get('/query', queryHandler); router.post('/query', queryHandler); +// -------------------------- +// GET|POST /query_exemplars +// -------------------------- + +// Native Prometheus exposes exemplars via /api/v1/query_exemplars. We proxy +// straight through for Prometheus-backed connections. ClickHouse-backed metric +// exemplars are read directly from the OTel metric tables' `Exemplars.*` +// columns in the app (via renderMetricExemplarsChartConfig), so there is no +// ClickHouse table function to call here — return an empty result instead. +const queryExemplarsHandler: express.RequestHandler = async (req, res) => { + const startedAt = performance.now(); + let backend: PrometheusBackend = 'unknown'; + try { + const { teamId } = getNonNullUserWithTeam(req); + const params = getParams(req); + + const query = params.query; + if (!query) { + return res.status(400).json({ + status: 'error', + errorType: 'bad_data', + error: 'missing required parameter: query', + }); + } + + const connectionId = params.connectionId; + if (!connectionId) { + return res.status(400).json({ + status: 'error', + errorType: 'bad_data', + error: 'missing required parameter: connectionId', + }); + } + + const connection = await getConnectionById( + teamId.toString(), + connectionId, + true, + ); + if (!connection) { + return res.status(404).json({ + status: 'error', + errorType: 'bad_data', + error: 'Connection not found', + }); + } + + if (connection.isPrometheusEndpoint) { + backend = 'prometheus'; + await proxyToPrometheus( + connection.host, + '/api/v1/query_exemplars', + params, + res, + ); + return; + } + + // ClickHouse-backed PromQL: no native exemplar table function. Exemplars + // for structured metric charts are fetched app-side from the metric table. + backend = 'clickhouse'; + return res.json({ status: 'success', data: [] }); + } catch (e) { + prometheusQueryErrors.add(1, { endpoint: 'query_exemplars', backend }); + logger.error(e, 'Prometheus query_exemplars error'); + return res.status(400).json({ + status: 'error', + errorType: 'bad_data', + error: e instanceof Error ? e.message : String(e), + }); + } finally { + prometheusQueryDuration.record(performance.now() - startedAt, { + endpoint: 'query_exemplars', + backend, + }); + } +}; +router.get('/query_exemplars', queryExemplarsHandler); +router.post('/query_exemplars', queryExemplarsHandler); + // -------------------------- // GET /label/:name/values // -------------------------- diff --git a/packages/app/src/HDXMultiSeriesTimeChart.tsx b/packages/app/src/HDXMultiSeriesTimeChart.tsx index 06b2c1a61f..878fc6489a 100644 --- a/packages/app/src/HDXMultiSeriesTimeChart.tsx +++ b/packages/app/src/HDXMultiSeriesTimeChart.tsx @@ -11,6 +11,7 @@ import { CartesianGrid, Legend, ReferenceArea, + ReferenceDot, ReferenceLine, ResponsiveContainer, Tooltip, @@ -19,7 +20,7 @@ import { } from 'recharts'; import { AxisDomain } from 'recharts/types/util/types'; import { convertGranularityToSeconds } from '@hyperdx/common-utils/dist/core/utils'; -import { DisplayType } from '@hyperdx/common-utils/dist/types'; +import { DisplayType, Exemplar } from '@hyperdx/common-utils/dist/types'; import { Popover } from '@mantine/core'; import type { NumberFormat } from '@/types'; @@ -428,6 +429,39 @@ function CaptureActiveDot({ ); } +function numOrNull(v: unknown): number | null { + return typeof v === 'number' && !isNaN(v) ? v : null; +} + +/** + * Diamond marker for an exemplar, drawn via . + * Recharts injects cx/cy. Hovering opens a floating menu (handled by the parent + * via onHoverStart/onHoverEnd) to inspect the linked trace — the marker itself + * is not a click target. A larger transparent hit circle eases hovering. + */ +function ExemplarDot(props: any) { + const { cx, cy, exemplar, onHoverStart, onHoverEnd } = props; + if (typeof cx !== 'number' || typeof cy !== 'number') { + return null; + } + const s = 4; + return ( + onHoverStart?.(exemplar, cx, cy)} + onMouseLeave={() => onHoverEnd?.()} + > + + + + ); +} + /** * Compute the unique set of hexes referenced by `` defs * inside MemoChart. Exported so a unit test can pin the dedup-and-union @@ -476,6 +510,10 @@ export const MemoChart = memo(function MemoChart({ granularity, dateRangeEndInclusive = true, fitYAxisToData = false, + exemplars, + maxExemplars = 12, + onExemplarHover, + onExemplarHoverEnd, }: { graphResults: any[]; setIsClickActive: (v: any) => void; @@ -502,6 +540,14 @@ export const MemoChart = memo(function MemoChart({ * (with padding) instead of zero. **/ fitYAxisToData?: boolean; + /** Exemplar markers to overlay on the chart (linked to traces). */ + exemplars?: Exemplar[]; + /** Target number of exemplar markers to show (0 = unlimited). */ + maxExemplars?: number; + /** Invoked when the cursor enters an exemplar marker, with its pixel coords. */ + onExemplarHover?: (exemplar: Exemplar, cx: number, cy: number) => void; + /** Invoked when the cursor leaves an exemplar marker. */ + onExemplarHoverEnd?: () => void; }) { const _id = useId(); const id = _id.replace(/:/g, ''); @@ -610,17 +656,39 @@ export const MemoChart = memo(function MemoChart({ const shouldFitYAxis = fitYAxisToData && displayType !== DisplayType.StackedBar; - // The data min/max is only needed to either zoom into a selection or to - // fit the lower bound to the data. When neither applies, let Recharts - // auto-calculate the upper bound while pinning the lower bound to zero. + // Exemplars plot at the trace's own value, which can exceed the series + // (a slow request above the avg line), so they must influence the upper + // bound or they'd be clipped off-chart. + const exemplarValues = (exemplars ?? []) + .map(e => e.value) + .filter((v): v is number => typeof v === 'number' && !isNaN(v)); + const exemplarMax = exemplarValues.length + ? Math.max(...exemplarValues) + : -Infinity; + + // The data min/max is only needed to either zoom into a selection, fit the + // lower bound to the data, or make room for exemplar markers. When none + // apply, let Recharts auto-calculate the upper bound (lower pinned to zero). if (!hasSelection && !shouldFitYAxis) { - return [0, 'auto']; + if (exemplarMax === -Infinity) return [0, 'auto']; + // Need an explicit upper bound to include exemplars; derive it from the + // visible series max and the exemplar max. + let seriesMax = -Infinity; + graphResults.forEach(dataPoint => { + lineData.forEach(ld => { + const value = dataPoint[ld.dataKey]; + if (typeof value === 'number' && !isNaN(value)) { + seriesMax = Math.max(seriesMax, value); + } + }); + }); + return [0, Math.max(seriesMax, exemplarMax) * 1.05]; } // Calculate domain based on visible series (all series when there's no // explicit selection). let minValue = Infinity; - let maxValue = -Infinity; + let maxValue = exemplarMax; graphResults.forEach(dataPoint => { lineData.forEach(ld => { @@ -653,6 +721,7 @@ export const MemoChart = memo(function MemoChart({ return ['auto', 'auto']; }, [ + exemplars, graphResults, lineData, selectedSeriesNames, @@ -721,6 +790,61 @@ export const MemoChart = memo(function MemoChart({ return map; }, [lineData]); + // Place each exemplar at its own value (the trace/span's actual measurement), + // never remapped onto the series line — the marker's height must match what + // the linked trace reports. Thinned to keep ~maxExemplars markers across the + // visible range: the highest-value (most notable, e.g. slowest) trace per + // window. The window is coarser than the chart granularity so the count stays + // readable even when every fine-grained bucket has an exemplar. + // maxExemplars <= 0 means "unlimited" — show every exemplar (deduped). + const exemplarPoints = useMemo(() => { + type ExemplarPoint = { + x: number; + y: number; + exemplar: Exemplar; + key: string; + }; + if (!exemplars?.length) return [] as ExemplarPoint[]; + + const toPoint = (exemplar: Exemplar, value: number): ExemplarPoint => ({ + x: exemplar.timestamp / 1000, // ms -> seconds (chart x unit) + y: value, + exemplar, + key: `exemplar-${exemplar.traceId}-${exemplar.timestamp}`, + }); + + if (maxExemplars <= 0) { + const all = new Map(); + for (const exemplar of exemplars) { + const value = numOrNull(exemplar.value); + if (value == null) continue; + const p = toPoint(exemplar, value); + all.set(p.key, p); // dedupe identical trace+time + } + return Array.from(all.values()); + } + + const granMs = convertGranularityToSeconds(granularity) * 1000; + const rangeMs = dateRange[1].getTime() - dateRange[0].getTime(); + const bucketMs = Math.max( + granMs || 1, + rangeMs > 0 ? Math.floor(rangeMs / maxExemplars) : granMs || 1, + ); + + const bestPerBucket = new Map(); + for (const exemplar of exemplars) { + const value = numOrNull(exemplar.value); + if (value == null) continue; + const bucket = Math.floor(exemplar.timestamp / bucketMs); + const key = `${exemplar.groupKey ?? ''}@${bucket}`; + const existing = bestPerBucket.get(key); + if (!existing || value > existing.y) { + bestPerBucket.set(key, toPoint(exemplar, value)); + } + } + return Array.from(bestPerBucket.values()); + }, [exemplars, maxExemplars, granularity, dateRange]); + const xAxisDomain: AxisDomain = useMemo(() => { let startTime = toStartOfInterval(dateRange[0], granularity); let endTime = toStartOfInterval(dateRange[1], granularity); @@ -946,6 +1070,21 @@ export const MemoChart = memo(function MemoChart({ /> )} {referenceLines} + {exemplarPoints.map(p => ( + + } + /> + ))} {highlightStart && highlightEnd ? ( ({ }, })); +// Exemplar hooks need a QueryClientProvider (via useMetadataWithSettings) and +// are irrelevant to queryKey consistency, so stub them out. +jest.mock('@/hooks/useExemplars', () => ({ + useExemplars: () => ({ exemplars: [], isLoading: false, isError: false }), + useExemplarTraceMeta: () => ({ data: null, isLoading: false }), +})); + jest.mock('@/hooks/useMVOptimizationExplanation', () => ({ useMVOptimizationExplanation: jest.fn().mockReturnValue({ data: undefined, diff --git a/packages/app/src/api.ts b/packages/app/src/api.ts index a79c782d5b..ddcef2ab37 100644 --- a/packages/app/src/api.ts +++ b/packages/app/src/api.ts @@ -509,6 +509,18 @@ type PrometheusLabelValuesResponse = { data?: string[]; error?: string; }; +// Native Prometheus /query_exemplars shape: one entry per series, each with its +// own exemplar list. `labels` carries the trace/span id (label naming varies by +// exporter, e.g. trace_id vs traceID — normalized downstream). +type PrometheusExemplarsResult = { + seriesLabels: PrometheusMetric; + exemplars: { labels: PrometheusMetric; value: string; timestamp: number }[]; +}; +type PrometheusQueryExemplarsResponse = { + status: 'success' | 'error'; + data?: PrometheusExemplarsResult[]; + error?: string; +}; async function prometheusFetch( path: string, @@ -554,6 +566,23 @@ export const prometheusApi = { ...(params.table ? { table: params.table } : {}), }), + queryExemplars: (params: { + query: string; + start: number; + end: number; + connectionId: string; + database?: string; + table?: string; + }): Promise => + prometheusFetch('v1/prometheus/query_exemplars', { + query: params.query, + start: String(params.start), + end: String(params.end), + connectionId: params.connectionId, + ...(params.database ? { database: params.database } : {}), + ...(params.table ? { table: params.table } : {}), + }), + labelValues: (params: { label: string; connectionId: string; diff --git a/packages/app/src/components/ChartEditor/PromqlChartEditor.tsx b/packages/app/src/components/ChartEditor/PromqlChartEditor.tsx index 1b6a7b346d..c5c592a562 100644 --- a/packages/app/src/components/ChartEditor/PromqlChartEditor.tsx +++ b/packages/app/src/components/ChartEditor/PromqlChartEditor.tsx @@ -1,6 +1,6 @@ import { Control, useController, useWatch } from 'react-hook-form'; import { SourceKind } from '@hyperdx/common-utils/dist/types'; -import { Box, Button, Flex, Stack, Text } from '@mantine/core'; +import { Box, Button, Flex, Stack, Switch, Text } from '@mantine/core'; import PromQLEditor from '@/components/PromQLEditor/PromQLEditor'; import { SourceSelectControlled } from '@/components/SourceSelect'; @@ -22,6 +22,10 @@ export default function PromqlChartEditor({ control, name: 'promqlExpression', }); + const { field: exemplarsField } = useController({ + control, + name: 'enableExemplars', + }); const sourceId = useWatch({ control, name: 'source' }); const { data: source } = useSource({ id: sourceId }); @@ -57,7 +61,33 @@ export default function PromqlChartEditor({ metricNames={metricNames} /> - + + + { + exemplarsField.onChange(exemplarsField.value !== true); + onSubmit(); + }} + /> + {exemplarsField.value === true && ( + + + Trace source + + + + )} + + + + + ); +} + function ActiveTimeTooltip({ activeClickPayload, buildSearchUrl, @@ -367,6 +498,69 @@ function DBTimeChartComponent({ id: sourceId || config.source, }); + // Exemplar overlay is configured per-chart via `enableExemplars` (set in the + // chart editor next to "As Ratio"), not a runtime toolbar toggle. The hook is + // a no-op unless the flag is set and the source kind supports exemplars. + const { exemplars } = useExemplars(queriedConfig, source); + + // Trace source an exemplar resolves against: the chart's explicit + // `exemplarTraceSourceId`, else the chart source's linked trace source. + const exemplarTraceSourceId = + queriedConfig.exemplarTraceSourceId || + (source as { traceSourceId?: string } | undefined)?.traceSourceId; + const { data: exemplarTraceSource } = useSource({ + id: exemplarTraceSourceId, + }); + + // Hover card state. A short close delay lets the cursor travel from the SVG + // marker into the HTML card without it closing. + const [hoveredExemplar, setHoveredExemplar] = useState<{ + exemplar: Exemplar; + x: number; + y: number; + } | null>(null); + const exemplarCloseTimer = useRef | null>(null); + const openExemplarCard = useCallback( + (exemplar: Exemplar, x: number, y: number) => { + if (exemplarCloseTimer.current) clearTimeout(exemplarCloseTimer.current); + setHoveredExemplar({ exemplar, x, y }); + }, + [], + ); + const scheduleCloseExemplarCard = useCallback(() => { + if (exemplarCloseTimer.current) clearTimeout(exemplarCloseTimer.current); + exemplarCloseTimer.current = setTimeout( + () => setHoveredExemplar(null), + 150, + ); + }, []); + useEffect( + () => () => { + if (exemplarCloseTimer.current) clearTimeout(exemplarCloseTimer.current); + }, + [], + ); + + const { data: hoveredTraceMeta, isLoading: isHoveredTraceMetaLoading } = + useExemplarTraceMeta( + hoveredExemplar?.exemplar.traceId, + exemplarTraceSource, + ); + + const navigateToExemplarTrace = useCallback( + (exemplar: Exemplar) => { + if (exemplarTraceSourceId) { + const params = new URLSearchParams(); + params.set('source', exemplarTraceSourceId); + params.set('traceId', exemplar.traceId); + Router.push(`/search?${params.toString()}`); + } else { + Router.push(`/trace/${encodeURIComponent(exemplar.traceId)}`); + } + }, + [exemplarTraceSourceId], + ); + const { formatByColumn, chartFormat: axisNumberFormat } = useChartNumberFormats(queriedConfig, data?.meta); @@ -720,6 +914,18 @@ function DBTimeChartComponent({ buildSearchUrl={buildSearchUrl} onDismiss={() => setActiveClickPayload(undefined)} /> + { + if (exemplarCloseTimer.current) + clearTimeout(exemplarCloseTimer.current); + }} + onMouseLeave={scheduleCloseExemplarCard} + /> )} diff --git a/packages/app/src/components/TeamSettings/TeamQueryConfigSection.tsx b/packages/app/src/components/TeamSettings/TeamQueryConfigSection.tsx index de850d2d6f..5d3af8715d 100644 --- a/packages/app/src/components/TeamSettings/TeamQueryConfigSection.tsx +++ b/packages/app/src/components/TeamSettings/TeamQueryConfigSection.tsx @@ -22,6 +22,7 @@ import SelectControlled from '@/components/SelectControlled'; import { DEFAULT_FILTER_KEYS_FETCH_LIMIT, DEFAULT_FILTER_KEYS_FETCH_LIMIT_WITH_MVS, + DEFAULT_MAX_EXEMPLARS, DEFAULT_QUERY_TIMEOUT, DEFAULT_SEARCH_ROW_LIMIT, } from '@/defaults'; @@ -345,6 +346,26 @@ export default function TeamQueryConfigSection() { /> + + + Chart Settings + + + + + + + ); } diff --git a/packages/app/src/components/__tests__/DBTimeChart.test.tsx b/packages/app/src/components/__tests__/DBTimeChart.test.tsx index 6d851cffe2..beaa9b7b95 100644 --- a/packages/app/src/components/__tests__/DBTimeChart.test.tsx +++ b/packages/app/src/components/__tests__/DBTimeChart.test.tsx @@ -35,6 +35,15 @@ jest.mock('@/source', () => ({ .mockReturnValue({ formatByColumn: new Map(), chartFormat: undefined }), })); +jest.mock('@/hooks/useExemplars', () => ({ + useExemplars: jest + .fn() + .mockReturnValue({ exemplars: [], isLoading: false, isError: false }), + useExemplarTraceMeta: jest + .fn() + .mockReturnValue({ data: null, isLoading: false }), +})); + jest.mock('../MaterializedViews/MVOptimizationIndicator', () => jest.fn(() => null), ); diff --git a/packages/app/src/defaults.ts b/packages/app/src/defaults.ts index c3537c5911..57f94e7139 100644 --- a/packages/app/src/defaults.ts +++ b/packages/app/src/defaults.ts @@ -6,6 +6,8 @@ export const DEFAULT_QUERY_TIMEOUT = 60; // max_execution_time, seconds export const DEFAULT_FILTER_KEYS_FETCH_LIMIT = 20; export const DEFAULT_FILTER_KEYS_FETCH_LIMIT_WITH_MVS = 100; export const DEFAULT_SERIES_LIMIT = 100; +// Target number of exemplar markers shown per chart (0 = unlimited). +export const DEFAULT_MAX_EXEMPLARS = 12; export function searchChartConfigDefaults( team: any | undefined | null, diff --git a/packages/app/src/hooks/__tests__/useExemplars.test.ts b/packages/app/src/hooks/__tests__/useExemplars.test.ts new file mode 100644 index 0000000000..7a0ac340cc --- /dev/null +++ b/packages/app/src/hooks/__tests__/useExemplars.test.ts @@ -0,0 +1,61 @@ +import { normalizePrometheusExemplars } from '@/hooks/useExemplars'; + +describe('normalizePrometheusExemplars', () => { + it('returns [] for undefined/empty input', () => { + expect(normalizePrometheusExemplars(undefined)).toEqual([]); + expect(normalizePrometheusExemplars([])).toEqual([]); + }); + + it('maps trace/span ids, value, and seconds→ms timestamp', () => { + const result = normalizePrometheusExemplars([ + { + seriesLabels: { __name__: 'http_latency', service: 'api' }, + exemplars: [ + { + labels: { trace_id: 'abc', span_id: 'def' }, + value: '1.5', + timestamp: 1700000000, + }, + ], + }, + ]); + expect(result).toEqual([ + { + timestamp: 1700000000 * 1000, + value: 1.5, + traceId: 'abc', + spanId: 'def', + groupKey: 'service="api"', + }, + ]); + }); + + it('accepts alternate label spellings (traceID/spanID)', () => { + const [ex] = normalizePrometheusExemplars([ + { + seriesLabels: {}, + exemplars: [ + { + labels: { traceID: 'xyz', spanID: 's1' }, + value: '2', + timestamp: 1, + }, + ], + }, + ]); + expect(ex.traceId).toBe('xyz'); + expect(ex.spanId).toBe('s1'); + expect(ex.groupKey).toBeUndefined(); + }); + + it('skips exemplars without a trace id', () => { + expect( + normalizePrometheusExemplars([ + { + seriesLabels: {}, + exemplars: [{ labels: { foo: 'bar' }, value: '1', timestamp: 1 }], + }, + ]), + ).toEqual([]); + }); +}); diff --git a/packages/app/src/hooks/useExemplars.tsx b/packages/app/src/hooks/useExemplars.tsx new file mode 100644 index 0000000000..bf5260f198 --- /dev/null +++ b/packages/app/src/hooks/useExemplars.tsx @@ -0,0 +1,213 @@ +import { renderMetricExemplarsChartConfig } from '@hyperdx/common-utils/dist/core/renderChartConfig'; +import { isPromqlChartConfig } from '@hyperdx/common-utils/dist/guards'; +import { + ChartConfigWithOptDateRange, + Exemplar, + SourceKind, + TSource, +} from '@hyperdx/common-utils/dist/types'; +import { useQuery } from '@tanstack/react-query'; + +import { prometheusApi } from '@/api'; +import { useClickhouseClient } from '@/clickhouse'; +import { useMetadataWithSettings } from '@/hooks/useMetadata'; +import { getDurationMsExpression } from '@/source'; + +// Source kinds that can produce exemplars today: native metric/promql sources. +// Trace-generated exemplars are added in a follow-up. +const EXEMPLAR_SUPPORTED_KINDS: SourceKind[] = [ + SourceKind.Metric, + SourceKind.Promql, +]; + +// Native Prometheus exporters disagree on the trace/span id label name; accept +// the common spellings. +const TRACE_ID_LABELS = ['trace_id', 'traceID', 'traceId', 'trace.id']; +const SPAN_ID_LABELS = ['span_id', 'spanID', 'spanId', 'span.id']; + +function pick(labels: Record, keys: string[]) { + for (const k of keys) { + if (labels[k]) return labels[k]; + } + return undefined; +} + +/** + * Normalize a native Prometheus /query_exemplars response into the shared + * Exemplar shape. Exported for testing — label naming varies by exporter. + */ +export function normalizePrometheusExemplars( + data: + | { + seriesLabels: Record; + exemplars: { + labels: Record; + value: string; + timestamp: number; + }[]; + }[] + | undefined, +): Exemplar[] { + if (!data) return []; + const out: Exemplar[] = []; + for (const series of data) { + const seriesLabels = series.seriesLabels ?? {}; + const groupKey = + Object.entries(seriesLabels) + .filter(([k]) => k !== '__name__') + .map(([k, v]) => `${k}="${v}"`) + .join(', ') || undefined; + for (const ex of series.exemplars ?? []) { + const traceId = pick(ex.labels ?? {}, TRACE_ID_LABELS); + if (!traceId) continue; + out.push({ + timestamp: ex.timestamp * 1000, // prometheus exemplar ts is unix seconds + value: Number(ex.value), + traceId, + spanId: pick(ex.labels ?? {}, SPAN_ID_LABELS), + groupKey, + }); + } + } + return out; +} + +/** Map raw ClickHouse exemplar rows (renderMetricExemplarsChartConfig) → Exemplar[]. */ +function mapClickhouseExemplars(rows: Record[]): Exemplar[] { + return rows + .filter(r => r.traceId) + .map(r => ({ + timestamp: Number(r.timestamp), + value: Number(r.value), + traceId: String(r.traceId), + spanId: r.spanId ? String(r.spanId) : undefined, + })); +} + +/** + * Fetches exemplars for a chart in parallel with the main series query. A no-op + * (disabled query) unless `config.enableExemplars` is set and the source kind + * supports exemplars, so it adds zero cost to charts that don't use the overlay. + */ +export function useExemplars( + config: ChartConfigWithOptDateRange, + source: TSource | undefined, +) { + const clickhouseClient = useClickhouseClient(); + const metadata = useMetadataWithSettings(); + + const supported = !!source && EXEMPLAR_SUPPORTED_KINDS.includes(source.kind); + const enabled = config.enableExemplars === true && supported; + + const query = useQuery({ + queryKey: ['exemplars', config], + queryFn: async context => { + // PromQL → native Prometheus exemplars via the API proxy. + if (isPromqlChartConfig(config) && config.dateRange) { + const [startDate, endDate] = config.dateRange; + const resp = await prometheusApi.queryExemplars({ + query: config.promqlExpression, + start: startDate.getTime() / 1000, + end: endDate.getTime() / 1000, + connectionId: config.connection, + database: config.from?.databaseName, + table: config.from?.tableName, + }); + if (resp.status !== 'success') { + throw new Error(resp.error ?? 'query_exemplars failed'); + } + return normalizePrometheusExemplars(resp.data); + } + + // Structured metric source → exemplars stored on the OTel metric table. + const exemplarSql = await renderMetricExemplarsChartConfig( + config, + metadata, + ); + if (!exemplarSql) return []; + + const resp = await clickhouseClient.query({ + query: exemplarSql.sql, + query_params: exemplarSql.params, + format: 'JSON', + abort_signal: context.signal, + connectionId: config.connection, + }); + const json = await resp.json>(); + return mapClickhouseExemplars(json.data ?? []); + }, + enabled, + retry: 1, + refetchOnWindowFocus: false, + }); + + return { + exemplars: enabled ? (query.data ?? []) : [], + isLoading: query.isLoading, + isError: query.isError, + }; +} + +export type ExemplarTraceMeta = { + service?: string; + spanName?: string; + statusCode?: string; + durationMs?: number; + timestamp?: string; +}; + +/** + * Fetches a one-row summary of a trace (root/first span) from the given trace + * source, for the exemplar hover card. Enabled only while a trace id is hovered + * and a trace source is configured. + */ +export function useExemplarTraceMeta( + traceId: string | undefined, + traceSource: TSource | undefined, +) { + const clickhouseClient = useClickhouseClient(); + const isTrace = !!traceSource && traceSource.kind === SourceKind.Trace; + + return useQuery({ + queryKey: ['exemplarTraceMeta', traceId, traceSource?.id], + enabled: !!traceId && isTrace, + staleTime: 5 * 60 * 1000, + queryFn: async context => { + if (!traceId || !traceSource || traceSource.kind !== SourceKind.Trace) { + return null; + } + const s = traceSource; + const from = s.from.databaseName + ? `\`${s.from.databaseName}\`.\`${s.from.tableName}\`` + : `\`${s.from.tableName}\``; + const traceIdExpr = s.traceIdExpression || 'TraceId'; + const parentExpr = s.parentSpanIdExpression || 'ParentSpanId'; + const tsExpr = s.timestampValueExpression || 'Timestamp'; + const sql = ` + SELECT + ${s.serviceNameExpression || 'ServiceName'} AS service, + ${s.spanNameExpression || 'SpanName'} AS spanName, + ${s.statusCodeExpression || 'StatusCode'} AS statusCode, + ${getDurationMsExpression(s)} AS durationMs, + ${tsExpr} AS timestamp + FROM ${from} + WHERE ${traceIdExpr} = {traceId:String} + ORDER BY (${parentExpr} = '') DESC, ${tsExpr} ASC + LIMIT 1`; + const resp = await clickhouseClient.query({ + query: sql, + query_params: { traceId }, + format: 'JSON', + abort_signal: context.signal, + connectionId: s.connection, + }); + const json = await resp.json(); + const row = json.data?.[0]; + if (!row) return null; + return { + ...row, + durationMs: row.durationMs != null ? Number(row.durationMs) : undefined, + }; + }, + }); +} diff --git a/packages/common-utils/src/__tests__/renderChartConfig.test.ts b/packages/common-utils/src/__tests__/renderChartConfig.test.ts index 9159687546..8c480634fd 100644 --- a/packages/common-utils/src/__tests__/renderChartConfig.test.ts +++ b/packages/common-utils/src/__tests__/renderChartConfig.test.ts @@ -2,7 +2,9 @@ import { chSql, ColumnMeta, parameterizedQueryToSql } from '@/clickhouse'; import { Metadata } from '@/core/metadata'; import { ChartConfigWithOptDateRangeEx, + EXEMPLAR_QUERY_LIMIT, renderChartConfig, + renderMetricExemplarsChartConfig, timeFilterExpr, } from '@/core/renderChartConfig'; import { @@ -3436,3 +3438,140 @@ describe('renderChartConfig', () => { // gauge / sum / histogram snapshots earlier in this file plus the // cross-scope integration test in packages/api/src/clickhouse/__tests__. }); + +describe('renderMetricExemplarsChartConfig', () => { + let mockMetadata: jest.Mocked; + + beforeAll(() => { + jest.spyOn(console, 'warn').mockImplementation(() => {}); + jest.spyOn(console, 'error').mockImplementation(() => {}); + }); + afterAll(() => { + jest.restoreAllMocks(); + }); + + beforeEach(() => { + mockMetadata = { + getColumns: jest.fn().mockResolvedValue([]), + getMaterializedColumnsLookupTable: jest.fn().mockResolvedValue(null), + getColumn: jest.fn().mockResolvedValue(undefined), + getTableMetadata: jest + .fn() + .mockResolvedValue({ primary_key: 'TimeUnix' }), + getSkipIndices: jest.fn().mockResolvedValue([]), + getSetting: jest.fn().mockResolvedValue(undefined), + isClickHouseCloud: jest.fn().mockResolvedValue(false), + } as unknown as jest.Mocked; + }); + + const histogramConfig: ChartConfigWithOptDateRange = { + displayType: DisplayType.Line, + connection: 'test-connection', + metricTables: { + gauge: 'otel_metrics_gauge', + histogram: 'otel_metrics_histogram', + sum: 'otel_metrics_sum', + summary: 'otel_metrics_summary', + 'exponential histogram': 'otel_metrics_exponential_histogram', + }, + from: { databaseName: 'default', tableName: '' }, + select: [ + { + aggFn: 'quantile', + aggCondition: '', + aggConditionLanguage: 'lucene', + valueExpression: 'Value', + level: 0.95, + metricName: 'http.server.duration', + metricType: MetricsDataType.Histogram, + }, + ], + where: '', + whereLanguage: 'lucene', + timestampValueExpression: 'TimeUnix', + dateRange: [new Date('2025-02-12'), new Date('2025-02-14')], + granularity: '1 minute', + }; + + it('builds an ARRAY JOIN exemplar query against the metric-type table', async () => { + const generated = await renderMetricExemplarsChartConfig( + histogramConfig, + mockMetadata, + ); + expect(generated).not.toBeNull(); + const sql = parameterizedQueryToSql(generated!); + + // Surfaces the exemplar columns + expect(sql).toContain('ARRAY JOIN'); + expect(sql).toContain('`Exemplars.TraceId` AS ex_TraceId'); + expect(sql).toContain('toUnixTimestamp64Milli(ex_TimeUnix)'); + // Points at the histogram table and filters by metric name + time + expect(sql).toContain('otel_metrics_histogram'); + expect(sql).toContain("MetricName = 'http.server.duration'"); + expect(sql).toContain('TimeUnix'); + // Drops empty trace ids and caps the result set + expect(sql).toContain('notEmpty(ex_TraceId)'); + expect(sql).toContain(`LIMIT ${EXEMPLAR_QUERY_LIMIT}`); + }); + + it("scopes exemplars to the series' aggCondition so markers match the plotted line", async () => { + const filteredConfig = { + ...histogramConfig, + select: [ + { + aggFn: 'quantile', + level: 0.95, + valueExpression: 'Value', + metricName: 'http.server.duration', + metricType: MetricsDataType.Histogram, + aggCondition: "ServiceName = 'api'", + aggConditionLanguage: 'sql', + }, + ], + } as ChartConfigWithOptDateRange; + const generated = await renderMetricExemplarsChartConfig( + filteredConfig, + mockMetadata, + ); + expect(generated).not.toBeNull(); + const sql = parameterizedQueryToSql(generated!); + expect(sql).toContain("ServiceName = 'api'"); + }); + + it('returns null for a ratio config (exemplars are meaningless on a ratio axis)', async () => { + const ratioConfig = { + ...histogramConfig, + seriesReturnType: 'ratio', + select: [histogramConfig.select[0], histogramConfig.select[0]], + } as ChartConfigWithOptDateRange; + expect( + await renderMetricExemplarsChartConfig(ratioConfig, mockMetadata), + ).toBeNull(); + }); + + it('returns null for a multi-series config', async () => { + const multiConfig = { + ...histogramConfig, + select: [histogramConfig.select[0], histogramConfig.select[0]], + } as ChartConfigWithOptDateRange; + expect( + await renderMetricExemplarsChartConfig(multiConfig, mockMetadata), + ).toBeNull(); + }); + + it('returns null for a non-metric config', async () => { + const logConfig: ChartConfigWithOptDateRange = { + displayType: DisplayType.Line, + connection: 'test-connection', + from: { databaseName: 'default', tableName: 'otel_logs' }, + select: [{ aggFn: 'count', valueExpression: '' }], + where: '', + timestampValueExpression: 'Timestamp', + dateRange: [new Date('2025-02-12'), new Date('2025-02-14')], + granularity: '1 minute', + }; + expect( + await renderMetricExemplarsChartConfig(logConfig, mockMetadata), + ).toBeNull(); + }); +}); diff --git a/packages/common-utils/src/core/renderChartConfig.ts b/packages/common-utils/src/core/renderChartConfig.ts index 4f5c8a48d5..4c695b06de 100644 --- a/packages/common-utils/src/core/renderChartConfig.ts +++ b/packages/common-utils/src/core/renderChartConfig.ts @@ -2178,6 +2178,95 @@ export async function renderChartConfig( ]); } +/** Overall cap on exemplar markers returned for a single chart, so a wide + * time range can't flood the chart overlay with thousands of points. */ +export const EXEMPLAR_QUERY_LIMIT = 200; + +/** + * Builds a ClickHouse query that surfaces native exemplars stored on an OTel + * metric table (`Exemplars.TraceId/SpanId/Value/TimeUnix`). Returns null when + * the config is not a single-metric chart we can resolve a table for. + * + * Reuses `renderWhere` so the exemplar scan honors the exact same time range, + * metric-name, and user filters as the rendered series. Exemplars are kept as + * their own raw points (the marker sits at the exemplar's own value/time), not + * bucketed — so no `timeBucketExpr` here. + */ +export async function renderMetricExemplarsChartConfig( + chartConfig: ChartConfigWithOptDateRangeEx, + metadata: Metadata, +): Promise { + if ( + isRawSqlChartConfig(chartConfig) || + isPromqlChartConfig(chartConfig) || + !isMetricChartConfig(chartConfig) || + !Array.isArray(chartConfig.select) || + // Exemplars carry a single series' raw measurement (e.g. latency). They are + // meaningless on a ratio axis and ambiguous across multiple series, so only + // surface them for a single, non-ratio metric series. + chartConfig.select.length !== 1 || + chartConfig.seriesReturnType === 'ratio' + ) { + return null; + } + const { metricTables, select } = chartConfig; + const { metricType, metricName, metricNameSql } = select[0] ?? {}; + const table = + metricType && metricTables ? metricTables[metricType] : undefined; + if (!metricType || !metricName || !table) { + return null; + } + // Keep exemplars to latency metrics for now: a histogram's exemplar value is a + // request duration, which shares the chart's y-axis unit. Other metric types + // (counts/gauges/rates) put exemplars on an incompatible scale. + if (metricType !== MetricsDataType.Histogram) { + return null; + } + + // Build a config that points at the concrete metric-type table and carries + // the metric-name predicate alongside the user filters, then let renderWhere + // assemble the time filter + filters exactly as the main query does. The + // guards above narrow chartConfig to the metric builder config, so no cast. + const whereConfig: BuilderChartConfigWithOptDateRangeEx = { + ...chartConfig, + from: { ...chartConfig.from, tableName: table }, + timestampValueExpression: + chartConfig.timestampValueExpression || DEFAULT_METRIC_TABLE_TIME_COLUMN, + // Keep the original select so renderWhere applies the series' aggCondition — + // otherwise the exemplar scan would surface traces from other series (e.g. + // other services/routes/tenants) that share the same metric name. + filters: [ + ...(chartConfig.filters ?? []), + { + type: 'sql', + condition: createMetricNameFilter(metricName, metricNameSql), + }, + ], + }; + + const where = await renderWhere(whereConfig, metadata); + const from = renderFrom({ from: whereConfig.from }); + + return concatChSql(' ', [ + chSql`SELECT + toUnixTimestamp64Milli(ex_TimeUnix) AS timestamp, + ex_Value AS value, + ex_TraceId AS traceId, + ex_SpanId AS spanId`, + chSql`FROM ${from}`, + chSql`ARRAY JOIN + \`Exemplars.TimeUnix\` AS ex_TimeUnix, + \`Exemplars.Value\` AS ex_Value, + \`Exemplars.TraceId\` AS ex_TraceId, + \`Exemplars.SpanId\` AS ex_SpanId`, + chSql`WHERE ${where.sql ? where : chSql`1 = 1`} AND notEmpty(ex_TraceId)`, + // Native exemplars carry no interestingness signal; keep the highest-value + // ones as a stable cap. ponytail: value-desc cap, revisit if even sampling + // across buckets is wanted. + chSql`ORDER BY value DESC LIMIT ${{ Int32: EXEMPLAR_QUERY_LIMIT }}`, + ]); +} + // EditForm -> translateToQueriedChartConfig -> QueriedChartConfig // renderFn(QueriedChartConfig) -> sql // query(sql) -> data diff --git a/packages/common-utils/src/types.ts b/packages/common-utils/src/types.ts index e1a2e79974..30dbf4fb40 100644 --- a/packages/common-utils/src/types.ts +++ b/packages/common-utils/src/types.ts @@ -1185,6 +1185,15 @@ const SharedChartSettingsSchema = z.object({ // number tiles have no time dimension to bucket). Other display types // ignore the field. Kept at shared level mirroring `color` / `colorRules`. backgroundChart: BackgroundChartSchema.optional(), + // Opt-in: overlay exemplar markers (individual traces linked to the series) + // on time charts. Sourced natively for metric/PromQL sources and generated + // per time bucket for trace sources. The UI gates the toggle on supported + // source kinds; off by default so the extra exemplar query never runs. + enableExemplars: z.boolean().optional(), + // Trace source that an exemplar's trace id resolves against — used to fetch + // hover metadata and to deep-link straight to the trace. When unset, the + // chart source's linked `traceSourceId` is used as a fallback. + exemplarTraceSourceId: z.string().optional(), }); export const _ChartConfigSchema = SharedChartSettingsSchema.extend({ @@ -1301,6 +1310,24 @@ export const ChartConfigSchema = z.union([ export type ChartConfig = z.infer; +/** + * A single exemplar: an individual data point overlaid on a time chart that + * links back to a trace. Shared shape for both backends — metric/PromQL + * sources surface native exemplars, trace sources generate them per bucket. + */ +export const ExemplarSchema = z.object({ + timestamp: z.number(), // epoch ms, x-position on the chart + value: z.number(), // exemplar's own value (metric Value / trace Duration in ms) + traceId: z.string(), + spanId: z.string().optional(), + // Matches the series the exemplar belongs to (LineData.displayName) so the + // overlay can attach markers to the right line. Undefined => applies to all. + groupKey: z.string().optional(), + attributes: z.record(z.string()).optional(), +}); + +export type Exemplar = z.infer; + export type DateRange = { dateRange: [Date, Date]; dateRangeStartInclusive?: boolean; // default true @@ -1604,6 +1631,9 @@ export const TeamClickHouseSettingsSchema = z.object({ metadataMaxRowsToRead: z.number().optional(), parallelizeWhenPossible: z.boolean().optional(), filterKeysFetchLimit: z.number().optional(), + // Target number of exemplar markers shown per chart (0 = unlimited). Not a + // ClickHouse setting, but lives in the same team-config bag for reuse. + maxExemplars: z.number().optional(), }); /** Accepts null to unset (reset to default) a setting. */ @@ -1614,6 +1644,7 @@ export const TeamClickHouseSettingsUpdateSchema = z.object({ metadataMaxRowsToRead: z.number().nullish(), parallelizeWhenPossible: z.boolean().nullish(), filterKeysFetchLimit: z.number().nullish(), + maxExemplars: z.number().nullish(), }); export type TeamClickHouseSettingsUpdate = z.infer< typeof TeamClickHouseSettingsUpdateSchema diff --git a/packages/otel-collector/builder-config.yaml b/packages/otel-collector/builder-config.yaml index 5b8e0c8f94..23ef4ee2d5 100644 --- a/packages/otel-collector/builder-config.yaml +++ b/packages/otel-collector/builder-config.yaml @@ -132,6 +132,9 @@ connectors: - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/connector/routingconnector v__OTEL_COLLECTOR_VERSION__ + - gomod: + github.com/open-telemetry/opentelemetry-collector-contrib/connector/spanmetricsconnector + v__OTEL_COLLECTOR_VERSION__ extensions: # Core diff --git a/scripts/dev-env.sh b/scripts/dev-env.sh index 938aaeb4d7..135eca7ef5 100755 --- a/scripts/dev-env.sh +++ b/scripts/dev-env.sh @@ -55,6 +55,7 @@ HDX_DEV_OTEL_GRPC_PORT=$((30800 + HDX_DEV_SLOT)) HDX_DEV_OTEL_HTTP_PORT=$((30900 + HDX_DEV_SLOT)) HDX_DEV_OTEL_METRICS_PORT=$((31000 + HDX_DEV_SLOT)) HDX_DEV_OTEL_JSON_HTTP_PORT=$((31100 + HDX_DEV_SLOT)) +HDX_DEV_PROMETHEUS_PORT=$((31200 + HDX_DEV_SLOT)) # --- Docker Compose project name (unique per slot) --- HDX_DEV_PROJECT="hdx-dev-${HDX_DEV_SLOT}" @@ -80,6 +81,7 @@ export HDX_DEV_OTEL_GRPC_PORT export HDX_DEV_OTEL_HTTP_PORT export HDX_DEV_OTEL_METRICS_PORT export HDX_DEV_OTEL_JSON_HTTP_PORT +export HDX_DEV_PROMETHEUS_PORT export HDX_DEV_PROJECT export NX_CACHE_DIRECTORY @@ -111,6 +113,7 @@ cat > "${HDX_DEV_SLOTS_DIR}/${HDX_DEV_SLOT}.json" < ClickHouse). Metric + * exemplars are written to ClickHouse directly because OTel JS (1.30) does not + * yet emit metric exemplars and our collector has no spanmetrics connector. + * See README for the spanmetrics upgrade path. + */ +'use strict'; + +const { + trace, + ROOT_CONTEXT, + SpanStatusCode, + SpanKind, + diag, + DiagConsoleLogger, + DiagLogLevel, +} = require('@opentelemetry/api'); +// Surface OTLP export failures (auth, connection) instead of dropping silently. +diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.ERROR); +const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node'); +const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base'); +const { + OTLPTraceExporter, +} = require('@opentelemetry/exporter-trace-otlp-proto'); +const { Resource } = require('@opentelemetry/resources'); +const { ATTR_SERVICE_NAME } = require('@opentelemetry/semantic-conventions'); +const net = require('net'); + +// ── Config ────────────────────────────────────────────────────────── +const cfg = { + // OTLP HTTP base (port 4318). Traces are POSTed to /v1/traces. + otlpEndpoint: + process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://otel-collector:4318', + backfillMinutes: Number(process.env.GEN_BACKFILL_MINUTES || 30), + ratePerSec: Number(process.env.GEN_RATE_PER_SEC || 20), + // HyperDX collectors enforce bearer-token auth on OTLP ingest. Default to the + // dev INGESTION_API_KEY (packages/api/.env.development); override per env. + otlpApiKey: + process.env.GEN_OTLP_API_KEY || 'super-secure-ingestion-api-key', +}; + +// ── Weighted / uniform random helpers ─────────────────────────────── +function pickWeighted(choices) { + const total = choices.reduce((s, c) => s + c[1], 0); + let r = Math.random() * total; + for (const [value, weight] of choices) { + r -= weight; + if (r <= 0) return value; + } + return choices[choices.length - 1][0]; +} +function pickUniform(arr) { + return arr[Math.floor(Math.random() * arr.length)]; +} +// Box-Muller gaussian, clamped to >= 1ms +function gaussianMs(mean, stddev) { + let u = 0; + let v = 0; + while (u === 0) u = Math.random(); + while (v === 0) v = Math.random(); + const z = Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * v); + return Math.max(1, mean + z * stddev); +} + +// ── Attribute pools (cardinality + diversity) ─────────────────────── +const routes = [ + ['/api/users', 25], + ['/api/orders', 15], + ['/cart/checkout', 10], + ['/api/search', 25], + ['/api/products', 15], + ['/api/auth', 10], +]; +const methods = [ + ['GET', 60], + ['POST', 25], + ['PUT', 10], + ['DELETE', 5], +]; +const regions = [ + ['us-east-1', 40], + ['us-west-2', 30], + ['eu-west-1', 20], + ['ap-southeast-1', 10], +]; +const buildIDs = [ + ['build-7a1', 35], + ['build-7a2', 35], + ['build-7a3', 30], +]; +const platforms = [ + ['web', 50], + ['ios', 30], + ['android', 20], +]; +const featureFlags = [ + ['new-checkout-flow', 15], + ['dark-launch-search', 10], + ['legacy', 75], +]; +const tenants = [ + 'tenant-acme', + 'tenant-globex', + 'tenant-initech', + 'tenant-umbrella', +]; +const pods = Array.from({ length: 8 }, (_, i) => `pod-abc-${i + 1}`); +const userId = () => `user-${String(Math.floor(Math.random() * 500) + 1).padStart(4, '0')}`; + +function routeToService(route) { + switch (route) { + case '/api/orders': + case '/cart/checkout': + return 'order-service'; + case '/api/users': + case '/api/auth': + return 'user-service'; + case '/api/search': + case '/api/products': + return 'search-service'; + default: + return 'unknown-service'; + } +} +function serviceToDb(svc) { + if (svc === 'order-service' || svc === 'user-service') return 'postgres'; + if (svc === 'search-service') return 'elasticsearch'; + return 'none'; +} + +// ── Per-service tracer providers (ServiceName = resource attr) ─────── +const SERVICE_NAMES = [ + 'api-gateway', + 'order-service', + 'user-service', + 'search-service', + 'payment-service', + 'notification-service', +]; +const exporter = new OTLPTraceExporter({ + url: `${cfg.otlpEndpoint.replace(/\/$/, '')}/v1/traces`, + // HyperDX's bearertokenauth uses scheme:'' — send the raw token, no prefix. + headers: cfg.otlpApiKey ? { authorization: cfg.otlpApiKey } : undefined, +}); +const providers = new Map(); +const tracers = new Map(); +for (const name of SERVICE_NAMES) { + const provider = new NodeTracerProvider({ + resource: new Resource({ + [ATTR_SERVICE_NAME]: name, + 'service.version': '1.0.0', + }), + spanProcessors: [ + // Large queue so the backfill burst isn't dropped before export. + new BatchSpanProcessor(exporter, { + maxExportBatchSize: 512, + maxQueueSize: 32768, + scheduledDelayMillis: 500, + }), + ], + }); + providers.set(name, provider); + tracers.set(name, provider.getTracer('telemetry-generator')); +} +const tracer = name => tracers.get(name); + +const jitter = () => Math.random() * 2; // ms + +function commonAttrs(a) { + return { + 'http.method': a.method, + 'http.route': a.route, + 'user.id': a.uid, + 'app.tenant_id': a.tenant, + 'host.region': a.region, + 'app.build_id': a.buildID, + 'app.platform': a.platform, + 'app.feature_flag': a.featureFlag, + 'k8s.pod.name': a.pod, + }; +} +function markError(span, type, message) { + span.setStatus({ code: SpanStatusCode.ERROR, message }); + span.addEvent('exception', { + 'exception.type': type, + 'exception.message': message, + }); +} + +// ── Trace emission ────────────────────────────────────────────────── +// Emits a trace; the collector's spanmetrics connector derives the +// request-duration metric (with exemplars) from these spans. +function emitTrace(tsMs) { + const a = { + route: pickWeighted(routes), + method: pickWeighted(methods), + region: pickWeighted(regions), + buildID: pickWeighted(buildIDs), + platform: pickWeighted(platforms), + featureFlag: pickWeighted(featureFlags), + tenant: pickUniform(tenants), + uid: userId(), + pod: pickUniform(pods), + }; + const svc = routeToService(a.route); + const sc = detectScenario(a); + const plan = SCENARIOS[sc](a, svc); + + const root = tracer('api-gateway').startSpan( + `${a.method} ${a.route}`, + { + kind: SpanKind.SERVER, + startTime: tsMs, + attributes: { ...commonAttrs(a), 'http.status_code': plan.statusCode }, + }, + ROOT_CONTEXT, + ); + if (plan.error) markError(root, plan.error.type, plan.error.message); + else root.setStatus({ code: SpanStatusCode.OK }); + + const rootCtx = trace.setSpan(ROOT_CONTEXT, root); + const svcStart = tsMs + jitter(); + const svcSpan = tracer(svc).startSpan( + `${svc}.handle`, + { kind: SpanKind.INTERNAL, startTime: svcStart, attributes: svcAttrs(svc, a) }, + rootCtx, + ); + if (plan.error) markError(svcSpan, plan.error.type, plan.error.message); + const svcCtx = trace.setSpan(rootCtx, svcSpan); + + // Leaf spans (db / cache / downstream) per the scenario plan. + let cursor = svcStart + jitter(); + for (const leaf of plan.leaves) { + const leafSvc = leaf.service || svc; + const ls = tracer(leafSvc).startSpan( + leaf.name, + { + kind: SpanKind.CLIENT, + startTime: cursor, + attributes: { + [ATTR_SERVICE_NAME]: leafSvc, + 'host.region': a.region, + ...(leaf.attrs || {}), + }, + }, + svcCtx, + ); + if (leaf.error) markError(ls, leaf.error.type, leaf.error.message); + ls.end(cursor + leaf.durMs); + cursor += leaf.durMs + 1; + } + + svcSpan.end(svcStart + plan.svcDurMs); + root.end(tsMs + plan.rootDurMs); +} + +function svcAttrs(svc, a) { + return { + [ATTR_SERVICE_NAME]: svc, + 'http.route': a.route, + 'host.region': a.region, + 'app.build_id': a.buildID, + 'app.feature_flag': a.featureFlag, + 'app.platform': a.platform, + 'user.id': a.uid, + 'app.tenant_id': a.tenant, + 'k8s.pod.name': a.pod, + }; +} + +// ── Scenarios ─────────────────────────────────────────────────────── +// Each returns { rootDurMs, svcDurMs, statusCode, error?, leaves:[{name,durMs,service?,attrs?,error?}] } +const PAYMENT_TIMEOUT_RATE = 0.05; +const AUTH_LEAK_ERROR_RATE = 0.1; + +function detectScenario(a) { + const svc = routeToService(a.route); + if ( + a.route === '/cart/checkout' && + a.region === 'us-west-2' && + Math.random() < PAYMENT_TIMEOUT_RATE + ) + return 'paymentTimeout'; + if ( + a.route === '/cart/checkout' && + a.featureFlag === 'new-checkout-flow' && + a.region === 'eu-west-1' + ) + return 'slowCheckout'; + if (a.route === '/api/orders' && a.platform === 'ios' && a.buildID === 'build-7a3') + return 'iosOrderErrors'; + if ( + a.tenant === 'tenant-initech' && + a.featureFlag === 'dark-launch-search' && + a.route === '/api/search' + ) + return 'initechSearch'; + if ( + a.route === '/api/auth' && + a.buildID === 'build-7a3' && + (a.pod === 'pod-abc-7' || a.pod === 'pod-abc-8') + ) + return 'authMemoryLeak'; + if (a.region === 'ap-southeast-1' && svc === 'user-service') + return 'redisTimeoutApac'; + if (a.tenant === 'tenant-umbrella' && a.region === 'eu-west-1') + return 'umbrellaCompliance'; + return 'normal'; +} + +function normalStatus() { + const r = Math.random(); + if (r < 0.95) return 200; + if (r < 0.98) return 201; + return 404; +} + +const SCENARIOS = { + normal: (a, svc) => { + const db = serviceToDb(svc); + const leaves = []; + if (db === 'postgres') + leaves.push({ + name: 'postgres.query', + durMs: gaussianMs(10, 5), + attrs: { 'db.system': 'postgres', 'db.statement': `SELECT * FROM ${a.route.slice(5)}` }, + }); + else if (db === 'elasticsearch') + leaves.push({ + name: 'elasticsearch.search', + durMs: gaussianMs(10, 5), + attrs: { 'db.system': 'elasticsearch' }, + }); + if (a.route === '/cart/checkout') + leaves.push({ + name: 'payment-service.charge', + service: 'payment-service', + durMs: gaussianMs(10, 4), + }); + return { rootDurMs: gaussianMs(40, 20), svcDurMs: gaussianMs(25, 12), statusCode: normalStatus(), leaves }; + }, + slowCheckout: a => { + const leaves = []; + const n = 3 + Math.floor(Math.random() * 3); + for (let i = 0; i < n; i++) + leaves.push({ + name: 'postgres.query', + durMs: gaussianMs(200, 40), + attrs: { 'db.system': 'postgres', 'db.statement': 'SELECT * FROM orders WHERE id = ?' }, + }); + leaves.push({ name: 'payment-service.charge', service: 'payment-service', durMs: gaussianMs(200, 50) }); + return { rootDurMs: gaussianMs(1500, 400), svcDurMs: gaussianMs(1200, 350), statusCode: 200, leaves }; + }, + iosOrderErrors: () => ({ + rootDurMs: gaussianMs(250, 60), + svcDurMs: gaussianMs(100, 30), + statusCode: 500, + error: { type: 'ValidationError', message: 'malformed request body' }, + leaves: [], + }), + redisTimeoutApac: a => ({ + rootDurMs: gaussianMs(650, 120), + svcDurMs: gaussianMs(580, 100), + statusCode: 200, + leaves: [ + { name: 'redis.get', durMs: gaussianMs(550, 100), attrs: { 'db.system': 'redis', 'db.statement': `GET user:session:${a.uid}` } }, + { name: 'postgres.query', durMs: gaussianMs(30, 10), attrs: { 'db.system': 'postgres' } }, + ], + }), + initechSearch: () => ({ + rootDurMs: gaussianMs(3000, 500), + svcDurMs: gaussianMs(2800, 450), + statusCode: 500, + error: { type: 'TimeoutError', message: 'elasticsearch timeout' }, + leaves: [ + { + name: 'elasticsearch.search', + durMs: gaussianMs(2500, 400), + attrs: { 'db.system': 'elasticsearch' }, + error: { type: 'IOException', message: 'read tcp: i/o timeout' }, + }, + ], + }), + authMemoryLeak: a => { + const err = Math.random() < AUTH_LEAK_ERROR_RATE; + return { + rootDurMs: gaussianMs(800, 200), + svcDurMs: gaussianMs(700, 180), + statusCode: err ? 503 : 200, + error: err ? { type: 'ServiceUnavailableError', message: 'GC overhead' } : undefined, + leaves: [{ name: 'redis.get', durMs: gaussianMs(600, 150), attrs: { 'db.system': 'redis', 'db.statement': `GET auth:token:${a.uid}` } }], + }; + }, + paymentTimeout: () => ({ + rootDurMs: gaussianMs(5000, 500), + svcDurMs: gaussianMs(4800, 450), + statusCode: 504, + error: { type: 'TimeoutError', message: 'gateway timeout' }, + leaves: [ + { name: 'postgres.query', durMs: gaussianMs(20, 8), attrs: { 'db.system': 'postgres' } }, + { + name: 'payment-service.charge', + service: 'payment-service', + durMs: gaussianMs(4500, 300), + error: { type: 'TimeoutError', message: 'context deadline exceeded' }, + }, + ], + }), + umbrellaCompliance: (a, svc) => { + const overhead = gaussianMs(150, 40); + const db = serviceToDb(svc); + const leaves = [{ name: 'compliance.data_residency_check', durMs: overhead, attrs: { 'app.tenant_id': a.tenant } }]; + if (db === 'postgres') leaves.push({ name: 'postgres.query', durMs: gaussianMs(10, 5), attrs: { 'db.system': 'postgres' } }); + else if (db === 'elasticsearch') leaves.push({ name: 'elasticsearch.search', durMs: gaussianMs(10, 5), attrs: { 'db.system': 'elasticsearch' } }); + return { rootDurMs: gaussianMs(40, 20) + overhead, svcDurMs: gaussianMs(40, 20) + overhead, statusCode: normalStatus(), leaves }; + }, +}; + +// The collector's OTLP receiver is configured dynamically via OpAMP, so the +// port isn't open the instant the container starts — it appears only after the +// collector fetches its remote config. Wait for it before backfilling, else the +// initial burst hits ECONNREFUSED and is lost. +function waitForCollector(timeoutMs = 120_000) { + const u = new URL(cfg.otlpEndpoint); + const port = Number(u.port) || (u.protocol === 'https:' ? 443 : 80); + const host = u.hostname; + const deadline = Date.now() + timeoutMs; + const tryOnce = () => + new Promise(resolve => { + const sock = net.connect({ host, port }, () => { + sock.destroy(); + resolve(true); + }); + sock.on('error', () => { + sock.destroy(); + resolve(false); + }); + sock.setTimeout(2000, () => { + sock.destroy(); + resolve(false); + }); + }); + return (async () => { + while (Date.now() < deadline) { + if (await tryOnce()) return true; + await new Promise(r => setTimeout(r, 2000)); + } + return false; + })(); +} + +// ── Main ──────────────────────────────────────────────────────────── +async function main() { + console.log( + `telemetry-generator starting: otlp=${cfg.otlpEndpoint} ` + + `backfill=${cfg.backfillMinutes}m rate=${cfg.ratePerSec}/s`, + ); + + console.log('waiting for collector OTLP endpoint...'); + const ready = await waitForCollector(); + console.log(ready ? 'collector reachable' : 'collector wait timed out, proceeding anyway'); + + // Backfill historical traces. The spanmetrics connector derives the + // request-duration metric (with exemplars) from these spans. + const now = Date.now(); + const backfillStart = now - cfg.backfillMinutes * 60_000; + const backfillCount = cfg.backfillMinutes * 60 * cfg.ratePerSec; + console.log(`backfilling ~${backfillCount} requests over ${cfg.backfillMinutes}m...`); + for (let i = 0; i < backfillCount; i++) { + const ts = backfillStart + Math.random() * (now - backfillStart); + emitTrace(ts); + // Periodically yield so the span exporter can drain its queue instead of + // overflowing (which would drop traces, and thus their exemplars). + if (i % 500 === 0) await new Promise(r => setTimeout(r, 25)); + } + // Let the final span batches export before we declare backfill done. + await new Promise(r => setTimeout(r, 3000)); + console.log('backfill complete, starting live emission...'); + + // Live emission. + const intervalMs = Math.max(1, Math.floor(1000 / cfg.ratePerSec)); + setInterval(() => emitTrace(Date.now()), intervalMs); + + const shutdown = async () => { + console.log('shutting down...'); + await Promise.all( + [...providers.values()].map(p => p.shutdown().catch(() => {})), + ); + process.exit(0); + }; + process.on('SIGINT', shutdown); + process.on('SIGTERM', shutdown); +} + +main().catch(e => { + console.error(e); + process.exit(1); +});