Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions .changeset/exemplar-mode-metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
---
"@hyperdx/common-utils": minor
"@hyperdx/api": minor
"@hyperdx/app": minor
---

feat: add exemplar overlay for metric and PromQL charts

Time charts on metric and PromQL sources can now overlay exemplars —
individual data points linked to a trace — via the "Exemplars" toggle in the
chart editor (next to "As Ratio" for metric charts, in the PromQL editor for
PromQL charts). Markers snap onto the series line so the chart stays honest;
hovering a marker shows trace metadata (service, span, duration, status) from a
configurable exemplar trace source, with a button to open the trace directly.

For structured metric sources, exemplars are read directly from the OTel metric
tables' `Exemplars.*` columns (`renderMetricExemplarsChartConfig`), honoring the
chart's time range, metric name, and filters. For PromQL sources backed by a
real Prometheus endpoint, the new `/v1/prometheus/query_exemplars` route proxies
to Prometheus's native `/api/v1/query_exemplars`. The overlay is opt-in and runs
its query in parallel only when enabled, so charts that don't use it are
unaffected. Trace-source exemplar generation lands in a follow-up.
15 changes: 15 additions & 0 deletions .changeset/span-metrics-connector.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
"@hyperdx/api": minor
"@hyperdx/otel-collector": minor
---

feat: optional spanmetrics connector for metric exemplars

Adds the `spanmetricsconnector` to the collector build and wires it into the
OpAMP-generated collector config, gated on the `ENABLE_SPAN_METRICS` env flag
(off by default). When enabled, the collector derives `traces.span.metrics.*`
metrics (calls + duration histogram) from spans, with **exemplars enabled** on
the connector, so the duration histogram lands in ClickHouse with `Exemplars.*`
entries pointing back at the spans each value was measured from — giving
coherent, fully-OTLP metric exemplars without any direct ClickHouse writes.
The flag is enabled in local dev to back the new `telemetry-generator` service.
33 changes: 31 additions & 2 deletions docker-compose.dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ services:
OTEL_SUPERVISOR_LOGS: 'true'
HYPERDX_OTEL_EXPORTER_TABLES_TTL: '24h'
ENABLE_PROMQL: 'true'
# Remote-write the span-derived metrics (with exemplars) to the dev
# Prometheus so the native query_exemplars path is testable with real data.
SPAN_METRICS_PROM_RW_ENDPOINT: 'http://prometheus:9090/api/v1/write'
volumes:
- ./docker/otel-collector/config.yaml:/etc/otelcol-contrib/config.yaml
- ./docker/otel-collector/supervisor_docker.yaml.tmpl:/etc/otel/supervisor.yaml.tmpl
Expand Down Expand Up @@ -157,9 +160,15 @@ services:
hdx.dev.service: prometheus
hdx.dev.port: '${HDX_DEV_PROMETHEUS_PORT:-9090}'
hdx.dev.url: 'http://localhost:${HDX_DEV_PROMETHEUS_PORT:-9090}'
profiles:
- prometheus
image: prom/prometheus:latest
# exemplar-storage: stores exemplars so query_exemplars has data to return.
# remote-write receiver: lets Prometheus ingest the collector's span-derived
# metrics (with exemplars), which come from real generated data.
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--enable-feature=exemplar-storage'
- '--web.enable-remote-write-receiver'
- '--web.enable-lifecycle'
ports:
- '${HDX_DEV_PROMETHEUS_PORT:-9090}:9090'
volumes:
Expand All @@ -169,5 +178,25 @@ services:
- internal
restart: on-failure

# Synthetic traces (sent via OTLP) for local dev + e2e. The collector's
# spanmetrics connector derives coherent metric exemplars from them, which land
# in ClickHouse. See telemetry-generator/README.md.
telemetry-generator:
labels:
<<: *hdx-labels
hdx.dev.service: telemetry-generator
build:
context: ./telemetry-generator
environment:
OTEL_EXPORTER_OTLP_ENDPOINT: 'http://otel-collector:4318'
GEN_OTLP_API_KEY: '${INGESTION_API_KEY:-super-secure-ingestion-api-key}'
GEN_BACKFILL_MINUTES: '30'
GEN_RATE_PER_SEC: '20'
networks:
- internal
restart: on-failure
depends_on:
otel-collector:
condition: service_started

networks:
internal:
4 changes: 4 additions & 0 deletions docker/prometheus/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,7 @@ scrape_configs:
static_configs:
- targets: ['ch-server:9363']
metrics_path: '/metrics'

# Note: the span-derived request metrics (with exemplars) arrive via
# remote-write from the OTel collector, not a scrape — see the collector's
# SPAN_METRICS_PROM_RW_ENDPOINT and the spanmetrics connector.
3 changes: 2 additions & 1 deletion knip.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@
"ignore": [
"scripts/dev-portal/**",
".github/scripts/**",
"docker/hyperdx/**"
"docker/hyperdx/**",
"telemetry-generator/**"
],
"ignoreBinaries": ["make", "migrate", "playwright"],
"ignoreDependencies": [
Expand Down
6 changes: 4 additions & 2 deletions packages/api/.env.development
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ REDIS_URL=redis://localhost:6379
USAGE_STATS_ENABLED=false
NODE_OPTIONS="--max-http-header-size=131072"
ENABLE_SWAGGER=true
DEFAULT_CONNECTIONS=[{"name":"Local ClickHouse","host":"http://localhost:${HDX_DEV_CH_HTTP_PORT}","username":"default","password":""}]
DEFAULT_SOURCES=[{"from":{"databaseName":"default","tableName":"otel_logs"},"kind":"log","timestampValueExpression":"Timestamp","name":"Logs","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","metadataMaterializedViews":{"keyRollupTable":"otel_logs_key_rollup_15m","kvRollupTable":"otel_logs_kv_rollup_15m","granularity":"15 minute"},"connection":"Local ClickHouse","traceSourceId":"Traces","sessionSourceId":"Sessions","metricSourceId":"Metrics"},{"from":{"databaseName":"default","tableName":"otel_traces"},"kind":"trace","timestampValueExpression":"Timestamp","name":"Traces","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"SpanName","serviceNameExpression":"ServiceName","eventAttributesExpression":"SpanAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName","traceIdExpression":"TraceId","spanIdExpression":"SpanId","durationExpression":"Duration","durationPrecision":9,"parentSpanIdExpression":"ParentSpanId","spanNameExpression":"SpanName","spanKindExpression":"SpanKind","statusCodeExpression":"StatusCode","statusMessageExpression":"StatusMessage","metadataMaterializedViews":{"keyRollupTable":"otel_traces_key_rollup_15m","kvRollupTable":"otel_traces_kv_rollup_15m","granularity":"15 minute"},"connection":"Local 
ClickHouse","logSourceId":"Logs","sessionSourceId":"Sessions","metricSourceId":"Metrics"},{"from":{"databaseName":"default","tableName":""},"kind":"metric","timestampValueExpression":"TimeUnix","name":"Metrics","resourceAttributesExpression":"ResourceAttributes","metricTables":{"gauge":"otel_metrics_gauge","histogram":"otel_metrics_histogram","sum":"otel_metrics_sum","_id":"682586a8b1f81924e628e808","id":"682586a8b1f81924e628e808"},"connection":"Local ClickHouse","logSourceId":"Logs","traceSourceId":"Traces","sessionSourceId":"Sessions"},{"from":{"databaseName":"default","tableName":"hyperdx_sessions"},"kind":"session","timestampValueExpression":"TimestampTime","name":"Sessions","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","connection":"Local ClickHouse","logSourceId":"Logs","traceSourceId":"Traces","metricSourceId":"Metrics"},{"from":{"databaseName":"otel_json","tableName":"otel_logs"},"kind":"log","timestampValueExpression":"Timestamp","name":"JSON Logs","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","connection":"Local ClickHouse","traceSourceId":"JSON Traces","metricSourceId":"JSON Metrics"},{"from":{"databaseName":"otel_json","tableName":"otel_traces"},"kind":"trace","timestampValueExpression":"Timestamp","name":"JSON 
Traces","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"SpanName","serviceNameExpression":"ServiceName","eventAttributesExpression":"SpanAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName","traceIdExpression":"TraceId","spanIdExpression":"SpanId","durationExpression":"Duration","durationPrecision":9,"parentSpanIdExpression":"ParentSpanId","spanNameExpression":"SpanName","spanKindExpression":"SpanKind","statusCodeExpression":"StatusCode","statusMessageExpression":"StatusMessage","connection":"Local ClickHouse","logSourceId":"JSON Logs","metricSourceId":"JSON Metrics"},{"from":{"databaseName":"otel_json","tableName":""},"kind":"metric","timestampValueExpression":"TimeUnix","name":"JSON Metrics","resourceAttributesExpression":"ResourceAttributes","metricTables":{"gauge":"otel_metrics_gauge","histogram":"otel_metrics_histogram","sum":"otel_metrics_sum"},"connection":"Local ClickHouse","logSourceId":"JSON Logs","traceSourceId":"JSON Traces"}]
DEFAULT_CONNECTIONS=[{"name":"Local ClickHouse","host":"http://localhost:${HDX_DEV_CH_HTTP_PORT}","username":"default","password":""},{"name":"Local Prometheus","host":"http://localhost:${HDX_DEV_PROMETHEUS_PORT}","username":"","password":"","isPrometheusEndpoint":true}]
DEFAULT_SOURCES=[{"from":{"databaseName":"default","tableName":"otel_logs"},"kind":"log","timestampValueExpression":"Timestamp","name":"Logs","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","metadataMaterializedViews":{"keyRollupTable":"otel_logs_key_rollup_15m","kvRollupTable":"otel_logs_kv_rollup_15m","granularity":"15 minute"},"connection":"Local ClickHouse","traceSourceId":"Traces","sessionSourceId":"Sessions","metricSourceId":"Metrics"},{"from":{"databaseName":"default","tableName":"otel_traces"},"kind":"trace","timestampValueExpression":"Timestamp","name":"Traces","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"SpanName","serviceNameExpression":"ServiceName","eventAttributesExpression":"SpanAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName","traceIdExpression":"TraceId","spanIdExpression":"SpanId","durationExpression":"Duration","durationPrecision":9,"parentSpanIdExpression":"ParentSpanId","spanNameExpression":"SpanName","spanKindExpression":"SpanKind","statusCodeExpression":"StatusCode","statusMessageExpression":"StatusMessage","metadataMaterializedViews":{"keyRollupTable":"otel_traces_key_rollup_15m","kvRollupTable":"otel_traces_kv_rollup_15m","granularity":"15 minute"},"connection":"Local 
ClickHouse","logSourceId":"Logs","sessionSourceId":"Sessions","metricSourceId":"Metrics"},{"from":{"databaseName":"default","tableName":""},"kind":"metric","timestampValueExpression":"TimeUnix","name":"Metrics","resourceAttributesExpression":"ResourceAttributes","metricTables":{"gauge":"otel_metrics_gauge","histogram":"otel_metrics_histogram","sum":"otel_metrics_sum","_id":"682586a8b1f81924e628e808","id":"682586a8b1f81924e628e808"},"connection":"Local ClickHouse","logSourceId":"Logs","traceSourceId":"Traces","sessionSourceId":"Sessions"},{"from":{"databaseName":"default","tableName":"hyperdx_sessions"},"kind":"session","timestampValueExpression":"TimestampTime","name":"Sessions","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","connection":"Local ClickHouse","logSourceId":"Logs","traceSourceId":"Traces","metricSourceId":"Metrics"},{"from":{"databaseName":"otel_json","tableName":"otel_logs"},"kind":"log","timestampValueExpression":"Timestamp","name":"JSON Logs","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"Body","serviceNameExpression":"ServiceName","bodyExpression":"Body","eventAttributesExpression":"LogAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,SeverityText,Body","severityTextExpression":"SeverityText","traceIdExpression":"TraceId","spanIdExpression":"SpanId","connection":"Local ClickHouse","traceSourceId":"JSON Traces","metricSourceId":"JSON Metrics"},{"from":{"databaseName":"otel_json","tableName":"otel_traces"},"kind":"trace","timestampValueExpression":"Timestamp","name":"JSON 
Traces","displayedTimestampValueExpression":"Timestamp","implicitColumnExpression":"SpanName","serviceNameExpression":"ServiceName","eventAttributesExpression":"SpanAttributes","resourceAttributesExpression":"ResourceAttributes","defaultTableSelectExpression":"Timestamp,ServiceName,StatusCode,round(Duration/1e6),SpanName","traceIdExpression":"TraceId","spanIdExpression":"SpanId","durationExpression":"Duration","durationPrecision":9,"parentSpanIdExpression":"ParentSpanId","spanNameExpression":"SpanName","spanKindExpression":"SpanKind","statusCodeExpression":"StatusCode","statusMessageExpression":"StatusMessage","connection":"Local ClickHouse","logSourceId":"JSON Logs","metricSourceId":"JSON Metrics"},{"from":{"databaseName":"otel_json","tableName":""},"kind":"metric","timestampValueExpression":"TimeUnix","name":"JSON Metrics","resourceAttributesExpression":"ResourceAttributes","metricTables":{"gauge":"otel_metrics_gauge","histogram":"otel_metrics_histogram","sum":"otel_metrics_sum"},"connection":"Local ClickHouse","logSourceId":"JSON Logs","traceSourceId":"JSON Traces"},{"from":{"databaseName":"prometheus","tableName":"prometheus"},"kind":"promql","timestampValueExpression":"timestamp","name":"Prometheus","connection":"Local Prometheus","traceSourceId":"Traces"}]
INGESTION_API_KEY="super-secure-ingestion-api-key"
HYPERDX_API_KEY=$INGESTION_API_KEY
ANTHROPIC_API_KEY="your-anthropic-api-key-here"
ENABLE_PROMQL=true
ENABLE_SPAN_METRICS=true
ENABLE_SPAN_METRICS_PROM_RW=true
14 changes: 14 additions & 0 deletions packages/api/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,20 @@ export const DEFAULT_SOURCES = env.DEFAULT_SOURCES;

export const IS_PROMQL_ENABLED = env.ENABLE_PROMQL === 'true';

// Opt-in: have the collector derive request metrics (with trace exemplars) from
// spans via the spanmetrics connector. Off by default; enabled in dev so the
// telemetry-generator's traces produce coherent metric exemplars end-to-end.
export const IS_SPAN_METRICS_ENABLED = env.ENABLE_SPAN_METRICS === 'true';

// Opt-in: also remote-write the span-derived metrics (with exemplars) to a
// Prometheus endpoint (SPAN_METRICS_PROM_RW_ENDPOINT on the collector) so the
// native Prometheus query_exemplars path can be tested against real data.
// SPAN_METRICS_PROM_RW_ENDPOINT must also be set in the environment this
// module reads (`env`), otherwise the flag stays off: the check guards against
// generating a collector config whose ${env:SPAN_METRICS_PROM_RW_ENDPOINT}
// cannot be resolved, which would keep the collector from starting.
export const IS_SPAN_METRICS_PROM_RW_ENABLED =
env.ENABLE_SPAN_METRICS_PROM_RW === 'true' &&
!!env.SPAN_METRICS_PROM_RW_ENDPOINT;
Comment thread
greptile-apps[bot] marked this conversation as resolved.
Outdated

// FOR CI ONLY
export const CLICKHOUSE_HOST = env.CLICKHOUSE_HOST as string;
export const CLICKHOUSE_USER = env.CLICKHOUSE_USER as string;
Expand Down
1 change: 1 addition & 0 deletions packages/api/src/models/team.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ export default mongoose.model<ITeam>(
fieldMetadataDisabled: Boolean,
parallelizeWhenPossible: Boolean,
filterKeysFetchLimit: Number,
maxExemplars: Number,
},
{
timestamps: true,
Expand Down
78 changes: 78 additions & 0 deletions packages/api/src/opamp/controllers/opampController.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,13 @@ type CollectorConfig = {
pipelines: string[];
}>;
};
span_metrics?: {
histogram: { unit: string; explicit: { buckets: string[] } };
dimensions: Array<{ name: string }>;
exemplars: { enabled: boolean };
metrics_flush_interval: string;
namespace?: string;
};
};
exporters?: {
nop?: null;
Expand Down Expand Up @@ -132,6 +139,15 @@ type CollectorConfig = {
enabled: boolean;
};
};
'prometheusremotewrite/spanmetrics'?: {
endpoint: string;
tls: {
insecure: boolean;
};
resource_to_telemetry_conversion: {
enabled: boolean;
};
};
};
service: {
extensions: string[];
Expand Down Expand Up @@ -322,6 +338,68 @@ export const buildOtelCollectorConfig = (
};
}

if (
config.IS_SPAN_METRICS_ENABLED &&
otelCollectorConfig.connectors &&
otelCollectorConfig.exporters
) {
// Derive request metrics (with trace exemplars) from spans. The connector
// consumes the traces pipeline and feeds a dedicated metrics pipeline, so
// the resulting `traces.span.metrics.*` metrics land in ClickHouse with
// `Exemplars.*` entries pointing back at the spans they were measured from.
otelCollectorConfig.connectors.span_metrics = {
histogram: {
unit: 'ms',
explicit: {
buckets: [
'2ms',
'5ms',
'10ms',
'25ms',
'50ms',
'100ms',
'250ms',
'500ms',
'1s',
'2.5s',
'5s',
'10s',
],
},
},
dimensions: [
{ name: 'http.route' },
{ name: 'http.method' },
{ name: 'host.region' },
{ name: 'app.tenant_id' },
{ name: 'http.status_code' },
],
exemplars: { enabled: true },
metrics_flush_interval: '15s',
};
otelCollectorConfig.service.pipelines.traces.exporters.push(
'span_metrics',
);

const spanMetricsExporters = ['clickhouse'];
// Optionally also remote-write the derived metrics (with exemplars) to a
// Prometheus endpoint, so the native Prometheus `query_exemplars` path can
// be exercised against the same real, generated data.
if (config.IS_SPAN_METRICS_PROM_RW_ENABLED) {
otelCollectorConfig.exporters['prometheusremotewrite/spanmetrics'] = {
endpoint: '${env:SPAN_METRICS_PROM_RW_ENDPOINT}',
Comment thread
greptile-apps[bot] marked this conversation as resolved.
Outdated
tls: { insecure: true },
resource_to_telemetry_conversion: { enabled: true },
};
spanMetricsExporters.push('prometheusremotewrite/spanmetrics');
}
otelCollectorConfig.service.pipelines['metrics/spanmetrics'] = {
receivers: ['span_metrics'],
processors: ['memory_limiter', 'batch'],
exporters: spanMetricsExporters,
};
}

if (collectorAuthenticationEnforced) {
if (otelCollectorConfig.receivers['otlp/hyperdx'] == null) {
// should never happen
Expand Down
Loading
Loading