diff --git a/.changeset/fix-alert-history-lookback.md b/.changeset/fix-alert-history-lookback.md new file mode 100644 index 0000000000..7f691a7bed --- /dev/null +++ b/.changeset/fix-alert-history-lookback.md @@ -0,0 +1,7 @@ +--- +"@hyperdx/api": patch +--- + +fix(api): bound alert history lookback by check interval + +`getPreviousAlertHistories` scanned a fixed 7-day window per alert per tick. Size the narrow lookback from each alert's interval (with a floor and 7-day fallback when empty) to cut MongoDB index scans on the hot path. diff --git a/packages/api/src/tasks/checkAlerts/__tests__/checkAlerts.test.ts b/packages/api/src/tasks/checkAlerts/__tests__/checkAlerts.test.ts index 16c0726065..ea89a325d9 100644 --- a/packages/api/src/tasks/checkAlerts/__tests__/checkAlerts.test.ts +++ b/packages/api/src/tasks/checkAlerts/__tests__/checkAlerts.test.ts @@ -1,6 +1,7 @@ import { ClickhouseClient } from '@hyperdx/common-utils/dist/clickhouse/node'; import { AlertErrorType, + AlertInterval, AlertState, AlertThresholdType, SourceKind, @@ -36,6 +37,7 @@ import Webhook, { IWebhook } from '@/models/webhook'; import * as checkAlert from '@/tasks/checkAlerts'; import { alertHasGroupBy, + computeAlertHistoryLookbackMs, doesExceedThreshold, getPreviousAlertHistories, getScheduledWindowStart, @@ -2115,7 +2117,7 @@ describe('checkAlerts', () => { teamWebhooksById: Map, ) => { const previousMap = await getPreviousAlertHistories( - [details.alert.id], + [{ id: details.alert.id, interval: details.alert.interval }], now, ); await processAlert( @@ -6206,7 +6208,7 @@ describe('checkAlerts', () => { // Should NOT rescan 22:00-22:05 where service-b had data but was already checked const nextRun = new Date('2023-11-16T22:23:00.000Z'); const previousMapNextRun = await getPreviousAlertHistories( - [details.alert.id], + [{ id: details.alert.id, interval: details.alert.interval }], nextRun, ); @@ -8746,7 +8748,7 @@ describe('checkAlerts', () => { // Act - Run alerts twice to cover two periods let previousMap = await getPreviousAlertHistories( - [details.alert.id], + [{ id: details.alert.id, interval: details.alert.interval }], now, ); await processAlert( @@ -8763,7 +8765,7 @@ describe('checkAlerts', () => { const nextWindow = new Date('2023-11-16T22:15:00.000Z'); previousMap = await getPreviousAlertHistories( - [details.alert.id], + [{ id: details.alert.id, interval: details.alert.interval }], nextWindow, ); await processAlert( @@ -8890,7 +8892,7 @@ describe('checkAlerts', () => { // Act - Run alerts twice to cover two periods let previousMap = await getPreviousAlertHistories( - [details.alert.id], + [{ id: details.alert.id, interval: details.alert.interval }], now, ); await processAlert( @@ -8907,7 +8909,7 @@ describe('checkAlerts', () => { const nextWindow = new Date('2023-11-16T22:15:00.000Z'); previousMap = await getPreviousAlertHistories( - [details.alert.id], + [{ id: details.alert.id, interval: details.alert.interval }], nextWindow, ); await processAlert( @@ -8969,6 +8971,44 @@ describe('checkAlerts', () => { }).save(); }; + const toLookups = (ids: string[], interval: AlertInterval = '5m') => + ids.map(id => ({ id, interval })); + + it('computes interval-based lookback with a floor and max cap', () => { + expect(computeAlertHistoryLookbackMs('1m')).toBe(ms('1h')); + expect(computeAlertHistoryLookbackMs('5m')).toBe(ms('1h')); + expect(computeAlertHistoryLookbackMs('1h')).toBe(ms('5h')); + expect(computeAlertHistoryLookbackMs('1d')).toBe(ms('5d')); + }); + + it('does not run the wide fallback for alerts with no history at all', async () => { + const alertId = new mongoose.Types.ObjectId(); + const aggregateSpy = jest.spyOn(AlertHistory, 'aggregate'); + + const result = await getPreviousAlertHistories( + [{ id: alertId.toString(), interval: '1m' }], + new Date('2025-01-10T00:00:00Z'), + ); + + expect(result.get(alertId.toString())).toBeUndefined(); + expect(aggregateSpy).toHaveBeenCalledTimes(1); + }); + + it('falls back to the max lookback when narrow window has no history', async () => { + const alertId = new mongoose.Types.ObjectId(); + const now = new Date('2025-01-10T00:00:00Z'); + await saveAlert(alertId, new Date('2025-01-07T12:00:00Z')); + + const result = await getPreviousAlertHistories( + [{ id: alertId.toString(), interval: '1m' }], + now, + ); + + expect(result.get(alertId.toString())!.createdAt).toEqual( + new Date('2025-01-07T12:00:00Z'), + ); + }); + it('should return the latest alert history for each alert', async () => { const alert1Id = new mongoose.Types.ObjectId(); await saveAlert(alert1Id, new Date('2025-01-01T00:00:00Z')); @@ -8979,7 +9019,7 @@ describe('checkAlerts', () => { await saveAlert(alert2Id, new Date('2025-01-01T00:15:00Z')); const result = await getPreviousAlertHistories( - [alert1Id.toString(), alert2Id.toString()], + toLookups([alert1Id.toString(), alert2Id.toString()]), new Date('2025-01-01T00:20:00Z'), ); @@ -9002,7 +9042,7 @@ describe('checkAlerts', () => { await saveAlert(alert2Id, new Date('2025-01-01T00:15:00Z')); // This one is in the future const result = await getPreviousAlertHistories( - [alert1Id.toString(), alert2Id.toString()], + toLookups([alert1Id.toString(), alert2Id.toString()]), new Date('2025-01-01T00:14:00Z'), ); @@ -9023,7 +9063,7 @@ describe('checkAlerts', () => { const alert2Id = new mongoose.Types.ObjectId(); const result = await getPreviousAlertHistories( - [alert1Id.toString(), alert2Id.toString()], + toLookups([alert1Id.toString(), alert2Id.toString()]), new Date('2025-01-01T00:20:00Z'), ); @@ -9044,7 +9084,7 @@ describe('checkAlerts', () => { await saveAlert(alert2Id, new Date('2025-01-01T00:15:00Z')); const result = await getPreviousAlertHistories( - [alert1Id.toString()], + toLookups([alert1Id.toString()]), new Date('2025-01-01T00:20:00Z'), ); @@ -9074,13 +9114,15 @@ describe('checkAlerts', () => { alert2Id.toString(), ]; + const allLookups = toLookups(allIds); + const result = await getPreviousAlertHistories( - allIds, + allLookups, new Date('2025-01-01T00:20:00Z'), ); - // One aggregation per alert ID (no chunking) - expect(aggregateSpy).toHaveBeenCalledTimes(allIds.length); + // Alerts with recent history: one narrow query. Alerts with none: one narrow query only. + expect(aggregateSpy).toHaveBeenCalledTimes(2 + 150); expect(result.size).toBe(2); expect(result.get(alert1Id.toString())!.createdAt).toEqual( new Date('2025-01-01T00:05:00Z'), diff --git a/packages/api/src/tasks/checkAlerts/index.ts b/packages/api/src/tasks/checkAlerts/index.ts index 2209bf7139..504f344f99 100644 --- a/packages/api/src/tasks/checkAlerts/index.ts +++ b/packages/api/src/tasks/checkAlerts/index.ts @@ -32,7 +32,9 @@ import { isRawSqlSavedChartConfig, } from '@hyperdx/common-utils/dist/guards'; import { + ALERT_INTERVAL_TO_MINUTES, AlertErrorType, + AlertInterval, AlertThresholdType, BuilderChartConfigWithOptDateRange, ChartConfigWithOptDateRange, @@ -1244,69 +1246,124 @@ export interface AggregatedAlertHistory { group?: string; } +export type AlertHistoryLookup = { + id: string; + interval: AlertInterval; +}; + +/** Number of check intervals to include in the narrow history lookback window. */ +export const ALERT_HISTORY_LOOKBACK_INTERVAL_MULTIPLIER = 5; +/** Minimum narrow lookback regardless of alert interval. */ +export const ALERT_HISTORY_LOOKBACK_FLOOR_MS = ms('1h'); +/** Maximum lookback used when falling back from the narrow window. */ +export const ALERT_HISTORY_LOOKBACK_MAX_MS = ms('7d'); + +export function computeAlertHistoryLookbackMs(interval: AlertInterval): number { + const intervalMs = ALERT_INTERVAL_TO_MINUTES[interval] * 60 * 1000; + return Math.min( + ALERT_HISTORY_LOOKBACK_MAX_MS, + Math.max( + ALERT_HISTORY_LOOKBACK_FLOOR_MS, + intervalMs * ALERT_HISTORY_LOOKBACK_INTERVAL_MULTIPLIER, + ), + ); +} + +async function aggregatePreviousAlertHistoriesForAlert( + alertId: mongoose.Types.ObjectId, + now: Date, + lookbackMs: number, +): Promise { + const lookbackDate = new Date(now.getTime() - lookbackMs); + + return AlertHistory.aggregate([ + { + $match: { + alert: alertId, + createdAt: { $lte: now, $gte: lookbackDate }, + }, + }, + // With a single alert value, the compound index {alert: 1, group: 1, createdAt: -1} + // delivers results already in this sort order — this is an index-backed no-op sort. + { + $sort: { alert: 1, group: 1, createdAt: -1 }, + }, + // Group by {alert, group}, taking the first (latest) document's fields. + // Using $first on individual fields instead of $first: '$$ROOT' allows + // DocumentDB to avoid fetching full documents when not needed. + { + $group: { + _id: { + alert: '$alert', + group: '$group', + }, + createdAt: { $first: '$createdAt' }, + state: { $first: '$state' }, + }, + }, + { + $project: { + _id: '$_id.alert', + createdAt: 1, + state: 1, + group: '$_id.group', + }, + }, + ]); +} + /** - * Fetch the most recent AlertHistory value for each of the given alert IDs. + * Fetch the most recent AlertHistory value for each of the given alerts. * For group-by alerts, returns the latest history for each group within each alert. * * Uses per-alert queries instead of batched $in to leverage the compound index - * {alert: 1, group: 1, createdAt: -1} for index-backed sorting. With a single - * alert value, the index delivers results already sorted by {group, createdAt desc}, - * so the $sort is a no-op and $group + $first can short-circuit per group. + * {alert: 1, group: 1, createdAt: -1} for index-backed sorting. The lookback + * window is sized from each alert's check interval (with a floor and a 7-day + * fallback when the narrow window returns no rows but older history exists). * - * @param alertIds The list of alert IDs to query the latest history for. + * @param alerts Alert IDs and intervals to query the latest history for. * @param now The current date and time. AlertHistory documents that have a createdAt > now are ignored. * @returns A map from Alert IDs (or Alert ID + group) to their most recent AlertHistory. * For non-grouped alerts, the key is just the alert ID. * For grouped alerts, the key is "alertId||group" to track per-group state. */ export const getPreviousAlertHistories = async ( - alertIds: string[], + alerts: AlertHistoryLookup[], now: Date, ) => { - const lookbackDate = new Date(now.getTime() - ms('7d')); - // Use a concurrency-limited queue to avoid overwhelming the connection pool // when there are many alerts (e.g., 200+ alert IDs). const queue = new PQueue({ concurrency: ALERT_HISTORY_QUERY_CONCURRENCY }); const results = await Promise.all( - alertIds.map(alertId => + alerts.map(({ id, interval }) => queue.add(async () => { - const id = new mongoose.Types.ObjectId(alertId); - return AlertHistory.aggregate([ - { - $match: { - alert: id, - createdAt: { $lte: now, $gte: lookbackDate }, - }, - }, - // With a single alert value, the compound index {alert: 1, group: 1, createdAt: -1} - // delivers results already in this sort order — this is an index-backed no-op sort. - { - $sort: { alert: 1, group: 1, createdAt: -1 }, - }, - // Group by {alert, group}, taking the first (latest) document's fields. - // Using $first on individual fields instead of $first: '$$ROOT' allows - // DocumentDB to avoid fetching full documents when not needed. - { - $group: { - _id: { - alert: '$alert', - group: '$group', - }, - createdAt: { $first: '$createdAt' }, - state: { $first: '$state' }, - }, - }, - { - $project: { - _id: '$_id.alert', - createdAt: 1, - state: 1, - group: '$_id.group', - }, - }, - ]); + const alertId = new mongoose.Types.ObjectId(id); + const narrowLookbackMs = computeAlertHistoryLookbackMs(interval); + let histories = await aggregatePreviousAlertHistoriesForAlert( + alertId, + now, + narrowLookbackMs, + ); + + if ( + histories.length === 0 && + narrowLookbackMs < ALERT_HISTORY_LOOKBACK_MAX_MS + ) { + const hasAnyHistory = await AlertHistory.exists({ + alert: alertId, + createdAt: { $lte: now }, + }); + if (hasAnyHistory) { + histories = await aggregatePreviousAlertHistoriesForAlert( + alertId, + now, + ALERT_HISTORY_LOOKBACK_MAX_MS, + ); + } + } + + return histories; }), ), ); diff --git a/packages/api/src/tasks/checkAlerts/providers/default.ts b/packages/api/src/tasks/checkAlerts/providers/default.ts index 7a255bff76..221b9fb241 100644 --- a/packages/api/src/tasks/checkAlerts/providers/default.ts +++ b/packages/api/src/tasks/checkAlerts/providers/default.ts @@ -275,8 +275,10 @@ export default class DefaultAlertProvider implements AlertProvider { const alerts = await Alert.find({}); const now = new Date(); - const alertIds = alerts.map(({ id }) => id); - const previousAlerts = await getPreviousAlertHistories(alertIds, now); + const previousAlerts = await getPreviousAlertHistories( + alerts.map(alert => ({ id: alert.id, interval: alert.interval })), + now, + ); for (const alert of alerts) { try {