From 08da78b42c04075067d1e140f18cd1976f612b4c Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 6 Apr 2026 05:15:47 +0000
Subject: [PATCH] log: add debug logging to health monitor

Add 6 meaningful debug log calls to internal/launcher/health_monitor.go
using the existing logHealth logger (launcher:health namespace):

- NewHealthMonitor: log creation with interval and max restart failures
- Stop: log when stop is initiated (before blocking on doneCh)
- run: log goroutine startup with interval
- checkAll: log total servers being checked each cycle
- checkAll: log when a recovered server's failure counter is reset
- handleErrorState: log when max failures reached and restart is skipped
  (the now-stale "stay silent" comment there is updated to match)

These additions improve troubleshooting of health monitoring behavior
during development and production incident investigation.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 internal/launcher/health_monitor.go | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/internal/launcher/health_monitor.go b/internal/launcher/health_monitor.go
index c4864e35..4bfd4205 100644
--- a/internal/launcher/health_monitor.go
+++ b/internal/launcher/health_monitor.go
@@ -35,6 +35,7 @@ func NewHealthMonitor(l *Launcher, interval time.Duration) *HealthMonitor {
 	if interval <= 0 {
 		interval = DefaultHealthCheckInterval
 	}
+	logHealth.Printf("Creating health monitor: interval=%v, maxRestartFailures=%d", interval, maxConsecutiveRestartFailures)
 	return &HealthMonitor{
 		launcher: l,
 		interval: interval,
@@ -53,6 +54,7 @@ func (hm *HealthMonitor) Start() {
 
 // Stop signals the health monitor to stop and waits for it to finish.
 func (hm *HealthMonitor) Stop() {
+	logHealth.Print("Stopping health monitor, waiting for background goroutine to finish")
 	close(hm.stopCh)
 	<-hm.doneCh
 	logHealth.Print("Health monitor stopped")
@@ -61,6 +63,7 @@ func (hm *HealthMonitor) Stop() {
 
 func (hm *HealthMonitor) run() {
 	defer close(hm.doneCh)
+	logHealth.Printf("Health monitor goroutine started: interval=%v", hm.interval)
 	ticker := time.NewTicker(hm.interval)
 	defer ticker.Stop()
 
@@ -80,7 +83,9 @@ func (hm *HealthMonitor) run() {
 // checkAll iterates over every configured backend and attempts to restart
 // any server that is in an error state.
 func (hm *HealthMonitor) checkAll() {
-	for _, serverID := range hm.launcher.ServerIDs() {
+	serverIDs := hm.launcher.ServerIDs()
+	logHealth.Printf("Running health check: checking %d servers", len(serverIDs))
+	for _, serverID := range serverIDs {
 		state := hm.launcher.GetServerState(serverID)
 
 		switch state.Status {
@@ -89,6 +94,7 @@ func (hm *HealthMonitor) checkAll() {
 		case "running":
 			// Reset consecutive failure counter on healthy server.
 			if hm.consecutiveFailures[serverID] > 0 {
+				logHealth.Printf("Server recovered: resetting failure counter for serverID=%s (was %d)", serverID, hm.consecutiveFailures[serverID])
 				hm.consecutiveFailures[serverID] = 0
 			}
 		}
@@ -99,6 +105,7 @@ func (hm *HealthMonitor) handleErrorState(serverID string, state ServerState) {
 	failures := hm.consecutiveFailures[serverID]
 	if failures >= maxConsecutiveRestartFailures {
-		// Already logged when the threshold was reached; stay silent.
+		// Threshold was already logged once; note each skipped restart at debug level.
+		logHealth.Printf("Skipping restart for serverID=%s: max failures reached (%d/%d)", serverID, failures, maxConsecutiveRestartFailures)
 		return
 	}