From 08da78b42c04075067d1e140f18cd1976f612b4c Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 05:15:47 +0000
Subject: [PATCH] log: add debug logging to health monitor

Add 6 meaningful debug log calls to internal/launcher/health_monitor.go
using the existing logHealth logger (launcher:health namespace):

- NewHealthMonitor: log creation with interval and max restart failures
- Stop: log when stop is initiated (before blocking on doneCh)
- run: log goroutine startup with interval
- checkAll: log total servers being checked each cycle
- checkAll: log when a recovered server's failure counter is reset
- handleErrorState: log when max failures reached and restart is skipped
  (the now-stale "stay silent" comment there is updated to match)

These additions improve troubleshooting of health monitoring behavior
during development and production incident investigation.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 internal/launcher/health_monitor.go | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/internal/launcher/health_monitor.go b/internal/launcher/health_monitor.go
index c4864e35..4bfd4205 100644
--- a/internal/launcher/health_monitor.go
+++ b/internal/launcher/health_monitor.go
@@ -35,6 +35,7 @@ func NewHealthMonitor(l *Launcher, interval time.Duration) *HealthMonitor {
 	if interval <= 0 {
 		interval = DefaultHealthCheckInterval
 	}
+	logHealth.Printf("Creating health monitor: interval=%v, maxRestartFailures=%d", interval, maxConsecutiveRestartFailures)
 	return &HealthMonitor{
 		launcher: l,
 		interval: interval,
@@ -53,6 +54,7 @@ func (hm *HealthMonitor) Start() {
 
 // Stop signals the health monitor to stop and waits for it to finish.
 func (hm *HealthMonitor) Stop() {
+	logHealth.Print("Stopping health monitor, waiting for background goroutine to finish")
 	close(hm.stopCh)
 	<-hm.doneCh
 	logHealth.Print("Health monitor stopped")
@@ -61,6 +63,7 @@ func (hm *HealthMonitor) Stop() {
 
 func (hm *HealthMonitor) run() {
 	defer close(hm.doneCh)
+	logHealth.Printf("Health monitor goroutine started: interval=%v", hm.interval)
 	ticker := time.NewTicker(hm.interval)
 	defer ticker.Stop()
 
@@ -80,7 +83,9 @@ func (hm *HealthMonitor) run() {
 // checkAll iterates over every configured backend and attempts to restart
 // any server that is in an error state.
 func (hm *HealthMonitor) checkAll() {
-	for _, serverID := range hm.launcher.ServerIDs() {
+	serverIDs := hm.launcher.ServerIDs()
+	logHealth.Printf("Running health check: checking %d servers", len(serverIDs))
+	for _, serverID := range serverIDs {
 		state := hm.launcher.GetServerState(serverID)
 
 		switch state.Status {
@@ -89,6 +94,7 @@ func (hm *HealthMonitor) checkAll() {
 		case "running":
 			// Reset consecutive failure counter on healthy server.
 			if hm.consecutiveFailures[serverID] > 0 {
+				logHealth.Printf("Server recovered: resetting failure counter for serverID=%s (was %d)", serverID, hm.consecutiveFailures[serverID])
 				hm.consecutiveFailures[serverID] = 0
 			}
 		}
@@ -99,6 +105,7 @@ func (hm *HealthMonitor) handleErrorState(serverID string, state ServerState) {
 	failures := hm.consecutiveFailures[serverID]
 	if failures >= maxConsecutiveRestartFailures {
-		// Already logged when the threshold was reached; stay silent.
+		// Threshold was already logged once; note each skipped restart at debug level.
+		logHealth.Printf("Skipping restart for serverID=%s: max failures reached (%d/%d)", serverID, failures, maxConsecutiveRestartFailures)
 		return
 	}