Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changeset/fix-alert-history-lookback.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"@hyperdx/api": patch
---

fix(api): bound alert history lookback by check interval

`getPreviousAlertHistories` scanned a fixed 7-day window per alert per tick. Size the narrow lookback from each alert's interval (with a floor and 7-day fallback when empty) to cut MongoDB index scans on the hot path.
70 changes: 57 additions & 13 deletions packages/api/src/tasks/checkAlerts/__tests__/checkAlerts.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { ClickhouseClient } from '@hyperdx/common-utils/dist/clickhouse/node';
import {
AlertErrorType,
AlertInterval,
AlertState,
AlertThresholdType,
SourceKind,
Expand Down Expand Up @@ -36,6 +37,7 @@ import Webhook, { IWebhook } from '@/models/webhook';
import * as checkAlert from '@/tasks/checkAlerts';
import {
alertHasGroupBy,
computeAlertHistoryLookbackMs,
doesExceedThreshold,
getPreviousAlertHistories,
getScheduledWindowStart,
Expand Down Expand Up @@ -2115,7 +2117,7 @@ describe('checkAlerts', () => {
teamWebhooksById: Map<string, IWebhook>,
) => {
const previousMap = await getPreviousAlertHistories(
[details.alert.id],
[{ id: details.alert.id, interval: details.alert.interval }],
now,
);
await processAlert(
Expand Down Expand Up @@ -6206,7 +6208,7 @@ describe('checkAlerts', () => {
// Should NOT rescan 22:00-22:05 where service-b had data but was already checked
const nextRun = new Date('2023-11-16T22:23:00.000Z');
const previousMapNextRun = await getPreviousAlertHistories(
[details.alert.id],
[{ id: details.alert.id, interval: details.alert.interval }],
nextRun,
);

Expand Down Expand Up @@ -8746,7 +8748,7 @@ describe('checkAlerts', () => {

// Act - Run alerts twice to cover two periods
let previousMap = await getPreviousAlertHistories(
[details.alert.id],
[{ id: details.alert.id, interval: details.alert.interval }],
now,
);
await processAlert(
Expand All @@ -8763,7 +8765,7 @@ describe('checkAlerts', () => {

const nextWindow = new Date('2023-11-16T22:15:00.000Z');
previousMap = await getPreviousAlertHistories(
[details.alert.id],
[{ id: details.alert.id, interval: details.alert.interval }],
nextWindow,
);
await processAlert(
Expand Down Expand Up @@ -8890,7 +8892,7 @@ describe('checkAlerts', () => {

// Act - Run alerts twice to cover two periods
let previousMap = await getPreviousAlertHistories(
[details.alert.id],
[{ id: details.alert.id, interval: details.alert.interval }],
now,
);
await processAlert(
Expand All @@ -8907,7 +8909,7 @@ describe('checkAlerts', () => {

const nextWindow = new Date('2023-11-16T22:15:00.000Z');
previousMap = await getPreviousAlertHistories(
[details.alert.id],
[{ id: details.alert.id, interval: details.alert.interval }],
nextWindow,
);
await processAlert(
Expand Down Expand Up @@ -8969,6 +8971,46 @@ describe('checkAlerts', () => {
}).save();
};

const toLookups = (
ids: string[],
interval: AlertInterval = '5m',
) => ids.map(id => ({ id, interval }));

it('computes interval-based lookback with a floor and max cap', () => {
expect(computeAlertHistoryLookbackMs('1m')).toBe(ms('1h'));
expect(computeAlertHistoryLookbackMs('5m')).toBe(ms('1h'));
expect(computeAlertHistoryLookbackMs('1h')).toBe(ms('5h'));
expect(computeAlertHistoryLookbackMs('1d')).toBe(ms('5d'));
});

it('does not run the wide fallback for alerts with no history at all', async () => {
const alertId = new mongoose.Types.ObjectId();
const aggregateSpy = jest.spyOn(AlertHistory, 'aggregate');

const result = await getPreviousAlertHistories(
[{ id: alertId.toString(), interval: '1m' }],
new Date('2025-01-10T00:00:00Z'),
);

expect(result.get(alertId.toString())).toBeUndefined();
expect(aggregateSpy).toHaveBeenCalledTimes(1);
});

it('falls back to the max lookback when narrow window has no history', async () => {
const alertId = new mongoose.Types.ObjectId();
const now = new Date('2025-01-10T00:00:00Z');
await saveAlert(alertId, new Date('2025-01-07T12:00:00Z'));

const result = await getPreviousAlertHistories(
[{ id: alertId.toString(), interval: '1m' }],
now,
);

expect(result.get(alertId.toString())!.createdAt).toEqual(
new Date('2025-01-07T12:00:00Z'),
);
});

it('should return the latest alert history for each alert', async () => {
const alert1Id = new mongoose.Types.ObjectId();
await saveAlert(alert1Id, new Date('2025-01-01T00:00:00Z'));
Expand All @@ -8979,7 +9021,7 @@ describe('checkAlerts', () => {
await saveAlert(alert2Id, new Date('2025-01-01T00:15:00Z'));

const result = await getPreviousAlertHistories(
[alert1Id.toString(), alert2Id.toString()],
toLookups([alert1Id.toString(), alert2Id.toString()]),
new Date('2025-01-01T00:20:00Z'),
);

Expand All @@ -9002,7 +9044,7 @@ describe('checkAlerts', () => {
await saveAlert(alert2Id, new Date('2025-01-01T00:15:00Z')); // This one is in the future

const result = await getPreviousAlertHistories(
[alert1Id.toString(), alert2Id.toString()],
toLookups([alert1Id.toString(), alert2Id.toString()]),
new Date('2025-01-01T00:14:00Z'),
);

Expand All @@ -9023,7 +9065,7 @@ describe('checkAlerts', () => {
const alert2Id = new mongoose.Types.ObjectId();

const result = await getPreviousAlertHistories(
[alert1Id.toString(), alert2Id.toString()],
toLookups([alert1Id.toString(), alert2Id.toString()]),
new Date('2025-01-01T00:20:00Z'),
);

Expand All @@ -9044,7 +9086,7 @@ describe('checkAlerts', () => {
await saveAlert(alert2Id, new Date('2025-01-01T00:15:00Z'));

const result = await getPreviousAlertHistories(
[alert1Id.toString()],
toLookups([alert1Id.toString()]),
new Date('2025-01-01T00:20:00Z'),
);

Expand Down Expand Up @@ -9074,13 +9116,15 @@ describe('checkAlerts', () => {
alert2Id.toString(),
];

const allLookups = toLookups(allIds);

const result = await getPreviousAlertHistories(
allIds,
allLookups,
new Date('2025-01-01T00:20:00Z'),
);

// One aggregation per alert ID (no chunking)
expect(aggregateSpy).toHaveBeenCalledTimes(allIds.length);
// Alerts with recent history: one narrow query. Alerts with none: one narrow query only.
expect(aggregateSpy).toHaveBeenCalledTimes(2 + 150);
expect(result.size).toBe(2);
expect(result.get(alert1Id.toString())!.createdAt).toEqual(
new Date('2025-01-01T00:05:00Z'),
Expand Down
145 changes: 101 additions & 44 deletions packages/api/src/tasks/checkAlerts/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ import {
isRawSqlSavedChartConfig,
} from '@hyperdx/common-utils/dist/guards';
import {
ALERT_INTERVAL_TO_MINUTES,
AlertErrorType,
AlertInterval,
AlertThresholdType,
BuilderChartConfigWithOptDateRange,
ChartConfigWithOptDateRange,
Expand Down Expand Up @@ -1244,69 +1246,124 @@ export interface AggregatedAlertHistory {
group?: string;
}

export type AlertHistoryLookup = {
id: string;
interval: AlertInterval;
};

/** Number of check intervals to include in the narrow history lookback window. */
export const ALERT_HISTORY_LOOKBACK_INTERVAL_MULTIPLIER = 5;
/** Minimum narrow lookback regardless of alert interval. */
export const ALERT_HISTORY_LOOKBACK_FLOOR_MS = ms('1h');
/** Maximum lookback used when falling back from the narrow window. */
export const ALERT_HISTORY_LOOKBACK_MAX_MS = ms('7d');

export function computeAlertHistoryLookbackMs(interval: AlertInterval): number {
const intervalMs = ALERT_INTERVAL_TO_MINUTES[interval] * 60 * 1000;
return Math.min(
ALERT_HISTORY_LOOKBACK_MAX_MS,
Math.max(
ALERT_HISTORY_LOOKBACK_FLOOR_MS,
intervalMs * ALERT_HISTORY_LOOKBACK_INTERVAL_MULTIPLIER,
),
);
}

async function aggregatePreviousAlertHistoriesForAlert(
alertId: mongoose.Types.ObjectId,
now: Date,
lookbackMs: number,
): Promise<AggregatedAlertHistory[]> {
const lookbackDate = new Date(now.getTime() - lookbackMs);

return AlertHistory.aggregate<AggregatedAlertHistory>([
{
$match: {
alert: alertId,
createdAt: { $lte: now, $gte: lookbackDate },
},
},
// With a single alert value, the compound index {alert: 1, group: 1, createdAt: -1}
// delivers results already in this sort order — this is an index-backed no-op sort.
{
$sort: { alert: 1, group: 1, createdAt: -1 },
},
// Group by {alert, group}, taking the first (latest) document's fields.
// Using $first on individual fields instead of $first: '$$ROOT' allows
// DocumentDB to avoid fetching full documents when not needed.
{
$group: {
_id: {
alert: '$alert',
group: '$group',
},
createdAt: { $first: '$createdAt' },
state: { $first: '$state' },
},
},
{
$project: {
_id: '$_id.alert',
createdAt: 1,
state: 1,
group: '$_id.group',
},
},
]);
}

/**
* Fetch the most recent AlertHistory value for each of the given alert IDs.
* Fetch the most recent AlertHistory value for each of the given alerts.
* For group-by alerts, returns the latest history for each group within each alert.
*
* Uses per-alert queries instead of batched $in to leverage the compound index
* {alert: 1, group: 1, createdAt: -1} for index-backed sorting. With a single
* alert value, the index delivers results already sorted by {group, createdAt desc},
* so the $sort is a no-op and $group + $first can short-circuit per group.
* {alert: 1, group: 1, createdAt: -1} for index-backed sorting. The lookback
* window is sized from each alert's check interval (with a floor and a 7-day
* fallback when the narrow window returns no rows but older history exists).
*
* @param alertIds The list of alert IDs to query the latest history for.
* @param alerts Alert IDs and intervals to query the latest history for.
* @param now The current date and time. AlertHistory documents that have a createdAt > now are ignored.
* @returns A map from Alert IDs (or Alert ID + group) to their most recent AlertHistory.
* For non-grouped alerts, the key is just the alert ID.
* For grouped alerts, the key is "alertId||group" to track per-group state.
*/
export const getPreviousAlertHistories = async (
alertIds: string[],
alerts: AlertHistoryLookup[],
now: Date,
) => {
const lookbackDate = new Date(now.getTime() - ms('7d'));

// Use a concurrency-limited queue to avoid overwhelming the connection pool
// when there are many alerts (e.g., 200+ alert IDs).
const queue = new PQueue({ concurrency: ALERT_HISTORY_QUERY_CONCURRENCY });

const results = await Promise.all(
alertIds.map(alertId =>
alerts.map(({ id, interval }) =>
queue.add(async () => {
const id = new mongoose.Types.ObjectId(alertId);
return AlertHistory.aggregate<AggregatedAlertHistory>([
{
$match: {
alert: id,
createdAt: { $lte: now, $gte: lookbackDate },
},
},
// With a single alert value, the compound index {alert: 1, group: 1, createdAt: -1}
// delivers results already in this sort order — this is an index-backed no-op sort.
{
$sort: { alert: 1, group: 1, createdAt: -1 },
},
// Group by {alert, group}, taking the first (latest) document's fields.
// Using $first on individual fields instead of $first: '$$ROOT' allows
// DocumentDB to avoid fetching full documents when not needed.
{
$group: {
_id: {
alert: '$alert',
group: '$group',
},
createdAt: { $first: '$createdAt' },
state: { $first: '$state' },
},
},
{
$project: {
_id: '$_id.alert',
createdAt: 1,
state: 1,
group: '$_id.group',
},
},
]);
const alertId = new mongoose.Types.ObjectId(id);
const narrowLookbackMs = computeAlertHistoryLookbackMs(interval);
let histories = await aggregatePreviousAlertHistoriesForAlert(
alertId,
now,
narrowLookbackMs,
);

if (
histories.length === 0 &&
narrowLookbackMs < ALERT_HISTORY_LOOKBACK_MAX_MS
) {
const hasAnyHistory = await AlertHistory.exists({
alert: alertId,
createdAt: { $lte: now },
});
if (hasAnyHistory) {
histories = await aggregatePreviousAlertHistoriesForAlert(
alertId,
now,
ALERT_HISTORY_LOOKBACK_MAX_MS,
);
}
}
Comment thread
greptile-apps[bot] marked this conversation as resolved.

return histories;
}),
),
);
Expand Down
6 changes: 4 additions & 2 deletions packages/api/src/tasks/checkAlerts/providers/default.ts
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,10 @@ export default class DefaultAlertProvider implements AlertProvider {
const alerts = await Alert.find({});

const now = new Date();
const alertIds = alerts.map(({ id }) => id);
const previousAlerts = await getPreviousAlertHistories(alertIds, now);
const previousAlerts = await getPreviousAlertHistories(
alerts.map(alert => ({ id: alert.id, interval: alert.interval })),
now,
);

for (const alert of alerts) {
try {
Expand Down
Loading