diff --git a/press/press/doctype/alertmanager_webhook_log/alertmanager_webhook_log.py b/press/press/doctype/alertmanager_webhook_log/alertmanager_webhook_log.py
index 3118a561613..692529ef491 100644
--- a/press/press/doctype/alertmanager_webhook_log/alertmanager_webhook_log.py
+++ b/press/press/doctype/alertmanager_webhook_log/alertmanager_webhook_log.py
@@ -169,20 +169,27 @@ def get_labels_for_instance(self, instance: str) -> dict:
 				return alert["labels"]
 		return {}
 
-	def past_alert_instances(self, status: DF.Literal["Firing", "Resolved"]) -> set[str]:
+	def past_alert_instances(
+		self,
+		status: DF.Literal["Firing", "Resolved"],
+		since: str | None = None,
+	) -> set[str]:
+		filters = {
+			"alert": self.alert,
+			"severity": self.severity,
+			"status": status,
+			"group_key": ("like", f"%{self.incident_scope}%"),
+			"modified": [
+				">",
+				add_to_date(frappe.utils.now(), hours=-self.get_repeat_interval()),
+			],
+		}
+		if since:
+			filters["creation"] = [">", since]
 		past_alerts = frappe.get_all(
 			self.doctype,
 			fields=["payload"],
-			filters={
-				"alert": self.alert,
-				"severity": self.severity,
-				"status": status,
-				"group_key": ("like", f"%{self.incident_scope}%"),
-				"modified": [
-					">",
-					add_to_date(frappe.utils.now(), hours=-self.get_repeat_interval()),
-				],
-			},
+			filters=filters,
 			group_by="group_key",
 			ignore_ifnull=True,
 		)  # get site down alerts grouped by benches
@@ -199,14 +206,14 @@ def total_instances(self) -> int:
 			{"status": "Active", INCIDENT_SCOPE: self.incident_scope},
 		)
 
-	@property
-	def is_enough_firing(self):
+	def is_enough_firing(self, since: str | None = None):
 		if self.status == "Resolved":
 			firing_instances = len(
-				self.past_alert_instances("Firing") - self.past_alert_instances("Resolved")
+				self.past_alert_instances("Firing", since=since)
+				- self.past_alert_instances("Resolved", since=since)
 			)
 		else:
-			firing_instances = len(self.past_alert_instances("Firing"))
+			firing_instances = len(self.past_alert_instances("Firing", since=since))
 		return firing_instances > min(
 			math.floor(MIN_FIRING_INSTANCES_FRACTION * self.total_instances),
 			MIN_FIRING_INSTANCES
@@ -221,7 +228,7 @@ def validate_and_create_incident(self):
 		rule: PrometheusAlertRule = frappe.get_doc("Prometheus Alert Rule", self.alert)
 		if find(rule.ignore_on_clusters, lambda x: x.cluster == cluster):
 			return
-		if self.is_enough_firing:
+		if self.is_enough_firing():
 			self.create_incident()
 
 	def get_repeat_interval(self):
diff --git a/press/press/doctype/incident/incident.py b/press/press/doctype/incident/incident.py
index 8ee9c840f0c..b956a1e7799 100644
--- a/press/press/doctype/incident/incident.py
+++ b/press/press/doctype/incident/incident.py
@@ -789,12 +789,13 @@ def check_resolved(self):
 					"status": "Resolved",
 					"group_key": ("like", f"%{self.incident_scope}%"),
 					"alert": self.alert,
+					"creation": (">", self.creation),
 				},
 			)
 		except frappe.DoesNotExistError:
 			return
 		else:
-			if not last_resolved.is_enough_firing:
+			if not last_resolved.is_enough_firing(since=self.creation):
 				self.create_log_for_server(is_resolved=True)
 				self.resolve()
 
diff --git a/press/press/doctype/incident/test_incident.py b/press/press/doctype/incident/test_incident.py
index 9bda3449e94..57a7b8d3d79 100644
--- a/press/press/doctype/incident/test_incident.py
+++ b/press/press/doctype/incident/test_incident.py
@@ -370,6 +370,38 @@ def test_incident_gets_auto_resolved_when_resolved_alerts_fire(self):
 		incident.reload()
 		self.assertEqual(incident.status, "Auto-Resolved")
 
+	def test_subsequent_incident_not_resolved_by_previous_resolved_alerts(self):
+		"""When a server goes down and recovers, subsequent incidents should not
+		auto-resolve due to resolved alerts from the previous recovery."""
+		site = create_test_site()
+		alert = create_test_prometheus_alert_rule()
+
+		# First incident: server goes down and recovers
+		create_test_alertmanager_webhook_log(site=site, alert=alert, status="firing")
+		first_incident: Incident = frappe.get_last_doc("Incident")
+		self.assertEqual(first_incident.status, "Validating")
+		create_test_alertmanager_webhook_log(site=site, alert=alert, status="resolved")
+		resolve_incidents()
+		first_incident.reload()
+		self.assertEqual(first_incident.status, "Auto-Resolved")
+
+		# Second incident: server goes down again
+		create_test_alertmanager_webhook_log(site=site, alert=alert, status="firing")
+		second_incident: Incident = frappe.get_last_doc("Incident")
+		self.assertNotEqual(first_incident.name, second_incident.name)
+		self.assertEqual(second_incident.status, "Validating")
+
+		# Resolve should NOT auto-resolve the second incident using old resolved alerts
+		resolve_incidents()
+		second_incident.reload()
+		self.assertEqual(second_incident.status, "Validating")
+
+		# Only when new resolved alerts come in after the second incident
+		create_test_alertmanager_webhook_log(site=site, alert=alert, status="resolved")
+		resolve_incidents()
+		second_incident.reload()
+		self.assertEqual(second_incident.status, "Auto-Resolved")
+
 	@given(get_total_and_firing_for_ongoing_incident())
 	@settings(max_examples=20, deadline=timedelta(seconds=5))
 	def test_is_enough_firing_is_true_for_ongoing_incident(self, total_firing):
@@ -381,10 +413,10 @@ def test_is_enough_firing_is_true_for_ongoing_incident(self, total_firing):
 			patch.object(
 				AlertmanagerWebhookLog,
 				"past_alert_instances",
-				new=lambda x, y: firing_instances,
+				new=lambda x, y, since=None: firing_instances,
 			),
 		):
-			self.assertTrue(alert.is_enough_firing)
+			self.assertTrue(alert.is_enough_firing())
 
 	@given(get_total_firing_and_resolved_for_resolved_incident())
 	@settings(max_examples=20, deadline=timedelta(seconds=5))
@@ -402,7 +434,7 @@ def test_is_enough_firing_is_false_for_resolved_incident(self, total_firing_reso
 				side_effect=[firing_instances, resolved_instances],
 			),
 		):
-			self.assertFalse(alert.is_enough_firing)
+			self.assertFalse(alert.is_enough_firing())
 
 	def test_incident_does_not_resolve_when_other_alerts_are_still_firing_but_does_when_less_than_required_sites_are_down(
 		self,
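
The sketch below restates the quorum logic that is_enough_firing applies, stripped of Frappe so it runs on its own. It is an illustration, not repo code: the constant values are assumptions (the diff only references the names MIN_FIRING_INSTANCES and MIN_FIRING_INSTANCES_FRACTION), and the free function stands in for the method, with the firing/resolved instance sets passed in directly instead of queried from alert logs. It shows why the since cutoff matters: if resolved instances from an earlier recovery are allowed into the sets, the firing-minus-resolved difference shrinks and check_resolved wrongly auto-resolves the new incident.

# quorum_sketch.py -- hypothetical standalone illustration, not repo code
import math

MIN_FIRING_INSTANCES = 15            # assumed value; only the name appears in the diff
MIN_FIRING_INSTANCES_FRACTION = 0.4  # assumed value; only the name appears in the diff


def is_enough_firing(
	firing: set[str],
	resolved: set[str],
	total_instances: int,
	status: str = "Firing",
) -> bool:
	"""Mirror of the threshold check in the diff, with the alert-log
	queries replaced by explicit sets of instance names."""
	if status == "Resolved":
		# Only instances that fired and have NOT since resolved count.
		firing_count = len(firing - resolved)
	else:
		firing_count = len(firing)
	return firing_count > min(
		math.floor(MIN_FIRING_INSTANCES_FRACTION * total_instances),
		MIN_FIRING_INSTANCES,
	)


# Without a `since` cutoff, resolved alerts from an earlier recovery leak in:
stale = is_enough_firing({"s1", "s2", "s3"}, {"s1", "s2", "s3"}, 4, "Resolved")
assert stale is False  # not enough firing -> incident would wrongly auto-resolve
# With the cutoff, only resolved alerts created after the incident remain:
fresh = is_enough_firing({"s1", "s2", "s3"}, set(), 4, "Resolved")
assert fresh is True  # incident stays open until real recoveries arrive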