Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion internal/launcher/health_monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ func NewHealthMonitor(l *Launcher, interval time.Duration) *HealthMonitor {
if interval <= 0 {
interval = DefaultHealthCheckInterval
}
logHealth.Printf("Creating health monitor: interval=%v, maxRestartFailures=%d", interval, maxConsecutiveRestartFailures)
return &HealthMonitor{
launcher: l,
interval: interval,
Expand All @@ -53,6 +54,7 @@ func (hm *HealthMonitor) Start() {

// Stop signals the health monitor to stop and waits for it to finish.
func (hm *HealthMonitor) Stop() {
logHealth.Print("Stopping health monitor, waiting for background goroutine to finish")
close(hm.stopCh)
<-hm.doneCh
logHealth.Print("Health monitor stopped")
Expand All @@ -61,6 +63,7 @@ func (hm *HealthMonitor) Stop() {

func (hm *HealthMonitor) run() {
defer close(hm.doneCh)
logHealth.Printf("Health monitor goroutine started: interval=%v", hm.interval)

ticker := time.NewTicker(hm.interval)
defer ticker.Stop()
Expand All @@ -80,7 +83,9 @@ func (hm *HealthMonitor) run() {
// checkAll iterates over every configured backend and attempts to restart
// any server that is in an error state.
func (hm *HealthMonitor) checkAll() {
for _, serverID := range hm.launcher.ServerIDs() {
serverIDs := hm.launcher.ServerIDs()
logHealth.Printf("Running health check: checking %d servers", len(serverIDs))
for _, serverID := range serverIDs {
state := hm.launcher.GetServerState(serverID)

switch state.Status {
Expand All @@ -89,6 +94,7 @@ func (hm *HealthMonitor) checkAll() {
case "running":
// Reset consecutive failure counter on healthy server.
if hm.consecutiveFailures[serverID] > 0 {
logHealth.Printf("Server recovered: resetting failure counter for serverID=%s (was %d)", serverID, hm.consecutiveFailures[serverID])
hm.consecutiveFailures[serverID] = 0
}
}
Expand All @@ -99,6 +105,7 @@ func (hm *HealthMonitor) handleErrorState(serverID string, state ServerState) {
failures := hm.consecutiveFailures[serverID]
if failures >= maxConsecutiveRestartFailures {
// Already logged when the threshold was reached; emit only a debug-level trace and skip the restart.
logHealth.Printf("Skipping restart for serverID=%s: max failures reached (%d/%d)", serverID, failures, maxConsecutiveRestartFailures)
return
Comment on lines 106 to 109
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `failures >= maxConsecutiveRestartFailures` branch is documented as “stay silent”, and the PR description says the restart is “silently skipped”, but this change adds a debug log that will fire on every health-check tick for a permanently failed server (potential log spam when `DEBUG` enables `launcher:health`). Consider removing this log, or logging only once at the moment the threshold is reached, and keep the comment and PR description consistent with whichever behavior is chosen.

Copilot uses AI. Check for mistakes.
}

Expand Down
Loading