diff --git a/cmd/entire/cli/checkpoint/remote/git.go b/cmd/entire/cli/checkpoint/remote/git.go index 1fdd55ccaf..1fab50539e 100644 --- a/cmd/entire/cli/checkpoint/remote/git.go +++ b/cmd/entire/cli/checkpoint/remote/git.go @@ -10,10 +10,18 @@ import ( "strconv" "strings" "sync" + "time" + "github.com/entireio/cli/cmd/entire/cli/execx" "github.com/entireio/cli/cmd/entire/cli/settings" ) +// cancelWaitDelay caps how long Wait blocks reading from inherited pipes after +// the context is cancelled and the process group has been killed. Five seconds +// is generous enough to drain legitimate buffered output but short enough to +// keep a stuck push from holding the hook open indefinitely. +const cancelWaitDelay = 5 * time.Second + // CheckpointTokenEnvVar is the environment variable for providing an access token // used to authenticate git push/fetch operations for checkpoint branches. // The token is injected as an HTTP Basic Authorization header per RFC 7617: @@ -209,6 +217,11 @@ func newCommand(ctx context.Context, args ...string) *exec.Cmd { mkCmd := func(finalArgs []string) *exec.Cmd { c := exec.CommandContext(ctx, "git", finalArgs...) c.Stdin = nil // Disconnect stdin to prevent hanging in hook context + // git push/fetch over HTTPS spawns helpers (git-remote-https, credential + // helpers) that inherit our stdio pipes. Without process-group kill + + // WaitDelay, ctx cancellation only SIGKILLs git itself; the helpers keep + // the pipe open and CombinedOutput blocks indefinitely. + execx.KillOnCancel(c, cancelWaitDelay) return c } diff --git a/cmd/entire/cli/checkpoint/remote/git_killoncancel_unix_test.go b/cmd/entire/cli/checkpoint/remote/git_killoncancel_unix_test.go new file mode 100644 index 0000000000..e64674d948 --- /dev/null +++ b/cmd/entire/cli/checkpoint/remote/git_killoncancel_unix_test.go @@ -0,0 +1,29 @@ +//go:build unix + +package remote + +import ( + "context" + "testing" +) + +// newCommand must wire up KillOnCancel so that git push/fetch subprocesses +// don't strand grandchildren (git-remote-https, credential helpers) holding +// the inherited stdio pipes when their context fires. Without this, +// tryPushSessionsCommon's 2-minute timeout was effectively unbounded — the +// real-world symptom that motivated this guard. +func TestNewCommand_AppliesKillOnCancel(t *testing.T) { + t.Parallel() + + cmd := newCommand(context.Background(), "status") + + if cmd.SysProcAttr == nil || !cmd.SysProcAttr.Setpgid { + t.Fatalf("newCommand should set Setpgid for process-group kill; got %+v", cmd.SysProcAttr) + } + if cmd.Cancel == nil { + t.Fatal("newCommand should set Cancel to kill the process group") + } + if cmd.WaitDelay <= 0 { + t.Fatalf("newCommand should set WaitDelay; got %s", cmd.WaitDelay) + } +} diff --git a/cmd/entire/cli/execx/killoncancel_unix.go b/cmd/entire/cli/execx/killoncancel_unix.go new file mode 100644 index 0000000000..03e5aa15af --- /dev/null +++ b/cmd/entire/cli/execx/killoncancel_unix.go @@ -0,0 +1,31 @@ +//go:build unix + +package execx + +import ( + "os/exec" + "syscall" + "time" +) + +// KillOnCancel configures cmd so that when its context is cancelled the entire +// process group is terminated, and any inherited stdio pipes are forcibly +// closed after waitDelay. This is necessary for subprocesses that fork helpers +// (e.g. `git push` over HTTPS spawns `git-remote-https` and credential +// helpers): SIGKILL on the direct child leaves grandchildren holding the +// inherited pipes, and CombinedOutput stays blocked on Read until they exit on +// their own. Setpgid lets Cancel kill the whole group; WaitDelay is a backstop +// in case a process slips out of the group somehow. +func KillOnCancel(cmd *exec.Cmd, waitDelay time.Duration) { + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.SysProcAttr.Setpgid = true + cmd.Cancel = func() error { + if cmd.Process == nil { + return nil + } + return syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) + } + cmd.WaitDelay = waitDelay +} diff --git a/cmd/entire/cli/execx/killoncancel_unix_test.go b/cmd/entire/cli/execx/killoncancel_unix_test.go new file mode 100644 index 0000000000..c97b912bd6 --- /dev/null +++ b/cmd/entire/cli/execx/killoncancel_unix_test.go @@ -0,0 +1,86 @@ +//go:build unix + +package execx + +import ( + "context" + "os/exec" + "testing" + "time" +) + +// grandchildHoldingStdoutCmd returns a shell snippet that backgrounds a sleep +// (the grandchild) before waiting indefinitely in the parent. The grandchild +// inherits the parent's stdout, so any pipe-reading caller blocks on it long +// after the parent has been killed. The sleep is short enough that the +// regression-pinning test costs ~3s rather than 5s on every CI run while +// still leaving a clear gap above the 2s assertion threshold. +const grandchildHoldingStdoutCmd = "sleep 3 & echo started; wait" + +// Regression: exec.CommandContext alone does not unblock CombinedOutput when +// the killed child has grandchildren still holding the inherited stdout pipe. +// This test pins that behavior so reviewers can see why KillOnCancel exists. +// If a future Go release fixes this in the stdlib, this assertion fails and +// KillOnCancel can be reconsidered. +func TestExecCommandContext_HangsOnGrandchildHoldingPipe(t *testing.T) { + t.Parallel() + if _, err := exec.LookPath("sh"); err != nil { + t.Skip("sh not available") + } + + ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) + defer cancel() + + cmd := exec.CommandContext(ctx, "sh", "-c", grandchildHoldingStdoutCmd) + + start := time.Now() + // We expect CombinedOutput to return with an error (killed by ctx) — the + // point of the test is the duration, so the error is intentionally ignored. + _, _ = cmd.CombinedOutput() //nolint:errcheck // duration is what we assert on + elapsed := time.Since(start) + + if elapsed < 2*time.Second { + t.Fatalf("expected CombinedOutput to hang on grandchild stdout; got elapsed=%s (Go may have fixed this — consider removing KillOnCancel)", elapsed) + } +} + +// KillOnCancel must terminate the whole process group and release the pipes +// shortly after the context fires, even when grandchildren hold stdout. +func TestKillOnCancel_TerminatesGrandchildHoldingPipe(t *testing.T) { + t.Parallel() + if _, err := exec.LookPath("sh"); err != nil { + t.Skip("sh not available") + } + + ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) + defer cancel() + + cmd := exec.CommandContext(ctx, "sh", "-c", grandchildHoldingStdoutCmd) + KillOnCancel(cmd, 500*time.Millisecond) + + start := time.Now() + _, err := cmd.CombinedOutput() + elapsed := time.Since(start) + + if err == nil { + t.Fatal("expected error from cancelled command") + } + if elapsed > 2*time.Second { + t.Fatalf("KillOnCancel should unblock CombinedOutput well before the grandchild exits; got elapsed=%s", elapsed) + } +} + +func TestKillOnCancel_SetsSetpgid(t *testing.T) { + t.Parallel() + cmd := exec.CommandContext(context.Background(), "/bin/true") + KillOnCancel(cmd, time.Second) + if cmd.SysProcAttr == nil || !cmd.SysProcAttr.Setpgid { + t.Fatalf("KillOnCancel did not set Setpgid: %+v", cmd.SysProcAttr) + } + if cmd.WaitDelay != time.Second { + t.Fatalf("KillOnCancel WaitDelay = %s; want 1s", cmd.WaitDelay) + } + if cmd.Cancel == nil { + t.Fatal("KillOnCancel did not set Cancel") + } +} diff --git a/cmd/entire/cli/execx/killoncancel_windows.go b/cmd/entire/cli/execx/killoncancel_windows.go new file mode 100644 index 0000000000..693855d528 --- /dev/null +++ b/cmd/entire/cli/execx/killoncancel_windows.go @@ -0,0 +1,47 @@ +//go:build windows + +package execx + +import ( + "fmt" + "os/exec" + "strconv" + "syscall" + "time" +) + +// KillOnCancel configures cmd so that when its context is cancelled the +// process and any descendants are terminated and any inherited stdio pipes +// are forcibly closed after waitDelay. +// +// Windows specifics: CREATE_NEW_PROCESS_GROUP isolates the child from the +// parent's console signals, but neither it nor Process.Kill terminates +// descendants — git over HTTPS spawns git-remote-https plus credential +// helpers that survive a direct kill and keep the inherited stdio open. We +// shell out to taskkill.exe /F /T (force + tree) so the whole descendant +// chain dies. If taskkill is unavailable, we fall back to the direct kill +// and rely on waitDelay to bound the user-visible hang. +func KillOnCancel(cmd *exec.Cmd, waitDelay time.Duration) { + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.SysProcAttr.CreationFlags |= syscall.CREATE_NEW_PROCESS_GROUP + cmd.Cancel = func() error { + if cmd.Process == nil { + return nil + } + kill := exec.Command("taskkill.exe", "/F", "/T", "/PID", strconv.Itoa(cmd.Process.Pid)) + taskkillErr := kill.Run() + if taskkillErr == nil { + return nil + } + // taskkill is missing or refused — fall back to direct kill so at + // least the visible child terminates. WaitDelay still closes the + // pipes within bounded time, so the caller does not hang. + if killErr := cmd.Process.Kill(); killErr != nil { + return fmt.Errorf("taskkill: %w; fallback kill: %w", taskkillErr, killErr) + } + return taskkillErr + } + cmd.WaitDelay = waitDelay +} diff --git a/cmd/entire/cli/strategy/push_common.go b/cmd/entire/cli/strategy/push_common.go index 2243d346d6..dc1c39a01e 100644 --- a/cmd/entire/cli/strategy/push_common.go +++ b/cmd/entire/cli/strategy/push_common.go @@ -22,6 +22,38 @@ import ( "github.com/go-git/go-git/v6/plumbing/object" ) +// pushStderr is the destination for output emitted by doPushBranch itself +// (the dot lines, "Warning:" lines, hints). The standalone hint helpers +// (printCheckpointRemoteHint, printSettingsCommitHint, …) deliberately keep +// writing to os.Stderr because they pre-date this var and have their own +// inline-redirect tests. captureStderr patches both so end-to-end tests of +// doPushBranch still see hint output. +var pushStderr io.Writer = os.Stderr + +// pushAttemptTimeout caps a single tryPushSessionsCommon invocation. Two +// minutes is generous for a healthy network and short enough that a hung +// helper does not hold the hook open indefinitely. +var pushAttemptTimeout = 2 * time.Minute + +var fetchAttemptTimeout = 2 * time.Minute + +// errPushTimedOut signals that tryPushSessionsCommon's inner deadline fired +// before git push completed. doPushBranch uses it to suppress the +// sync-and-retry cascade — fetch would just time out on the same condition. +var errPushTimedOut = errors.New("push timed out") + +var errFetchTimedOut = errors.New("fetch timed out") + +// Log-field values for the "class" attribute on push-related INFO logs. +// Kept as constants so log consumers and tests share one source of truth and +// the function stays free of bare string literals. +const ( + pushClassTimedOut = "timed_out" + pushClassProtectedRef = "protected_ref" + pushClassNonFastForward = "non_fast_forward" + pushClassOther = "other" +) + // pushBranchIfNeeded pushes a branch to the given target if it has unpushed changes. // The target can be a remote name (e.g., "origin") or a URL for direct push. // When pushing to a URL, the "has unpushed" optimization is skipped since there are @@ -78,49 +110,172 @@ func displayPushTarget(target string) string { func doPushBranch(ctx context.Context, target, branchName string) error { displayTarget := displayPushTarget(target) - fmt.Fprintf(os.Stderr, "[entire] Pushing %s to %s...", branchName, displayTarget) - stop := startProgressDots(os.Stderr) + fmt.Fprintf(pushStderr, "[entire] Pushing %s to %s...", branchName, displayTarget) + stop := startProgressDots(pushStderr) + + pushStart := time.Now() + logging.Info(ctx, "push attempt start", + slog.String("branch", branchName), + slog.String("target", displayTarget), + ) - // Try pushing first result, err := tryPushSessionsCommon(ctx, target, branchName) if err == nil { + logging.Info(ctx, "push attempt completed", + slog.String("branch", branchName), + slog.String("target", displayTarget), + slog.Duration("elapsed", time.Since(pushStart)), + slog.Bool("up_to_date", result.upToDate), + ) finishPush(ctx, stop, result, target) return nil } + + logging.Info(ctx, "push attempt failed", + slog.String("branch", branchName), + slog.String("target", displayTarget), + slog.Duration("elapsed", time.Since(pushStart)), + slog.String("class", classifyForLog(err)), + ) + + // Outer context cancelled (user Ctrl-C, hook timeout) — bail quietly + // rather than cascading into a sync that will also fail. doPushBranch + // is best-effort: never fail the user's main push because of checkpoint + // sync, mirroring the existing graceful-degradation pattern below. + if ctx.Err() != nil { + stop("") + return nil //nolint:nilerr // intentional graceful degradation + } + + // Inner attempt timed out. Sync will also time out, so skip the cascade + // and give the user a clear next step instead. + if errors.Is(err, errPushTimedOut) { + stop(" timed out") + fmt.Fprintf(pushStderr, "[entire] Warning: push of %s to %s timed out after %s. Network or proxy issue?\n", + branchName, displayTarget, pushAttemptTimeout) + fmt.Fprintln(pushStderr, `[entire] Hint: set "log_level": "DEBUG" in .entire/settings.local.json to capture git's underlying output next time.`) + printCheckpointRemoteHint(target) + return nil + } + stop("") // Protected refs cannot be fixed by syncing and retrying. var protectedErr *protectedRefError if errors.As(err, &protectedErr) { - printProtectedRefBlock(os.Stderr, branchName, target) + printProtectedRefBlock(pushStderr, branchName, target) return nil } - // Push failed - likely non-fast-forward. Try to fetch and rebase. - fmt.Fprintf(os.Stderr, "[entire] Syncing %s with remote...", branchName) - stop = startProgressDots(os.Stderr) - - if err := fetchAndRebaseSessionsCommon(ctx, target, branchName); err != nil { - stop("") - fmt.Fprintf(os.Stderr, "[entire] Warning: couldn't sync %s: %v\n", branchName, err) - printCheckpointRemoteHint(target) - return nil // Don't fail the main push + fmt.Fprintf(pushStderr, "[entire] Syncing %s with remote...", branchName) + stop = startProgressDots(pushStderr) + syncStart := time.Now() + + if syncErr := fetchAndRebaseSessionsCommon(ctx, target, branchName); syncErr != nil { + reportPushFailure(ctx, reportPushFailureArgs{ + out: pushStderr, + stop: stop, + err: syncErr, + logMsg: "push sync failed", + warnMsg: fmt.Sprintf("[entire] Warning: couldn't sync %s: %v\n", branchName, syncErr), + branch: branchName, + target: target, + start: syncStart, + }) + return nil } stop(" done") - // Try pushing again after rebase - fmt.Fprintf(os.Stderr, "[entire] Pushing %s to %s...", branchName, displayTarget) - stop = startProgressDots(os.Stderr) + fmt.Fprintf(pushStderr, "[entire] Pushing %s to %s...", branchName, displayTarget) + stop = startProgressDots(pushStderr) + retryStart := time.Now() + + retryResult, retryErr := tryPushSessionsCommon(ctx, target, branchName) + if retryErr != nil { + reportPushFailure(ctx, reportPushFailureArgs{ + out: pushStderr, + stop: stop, + err: retryErr, + logMsg: "push retry failed", + warnMsg: fmt.Sprintf("[entire] Warning: failed to push %s after sync: %v\n", branchName, retryErr), + branch: branchName, + target: target, + start: retryStart, + }) + return nil + } + logging.Info(ctx, "push retry completed", + slog.String("branch", branchName), + slog.String("target", displayTarget), + slog.Duration("elapsed", time.Since(retryStart)), + slog.Bool("up_to_date", retryResult.upToDate), + ) + finishPush(ctx, stop, retryResult, target) + return nil +} - if result, err := tryPushSessionsCommon(ctx, target, branchName); err != nil { - stop("") - fmt.Fprintf(os.Stderr, "[entire] Warning: failed to push %s after sync: %v\n", branchName, err) - printCheckpointRemoteHint(target) - } else { - finishPush(ctx, stop, result, target) +// reportPushFailureArgs groups the inputs to reportPushFailure to keep the +// recoverable-failure paths in doPushBranch readable. out is taken as a +// parameter (rather than reaching into the package-global pushStderr) so +// tests can capture without racing on shared state. +type reportPushFailureArgs struct { + out io.Writer + stop func(string) + err error + logMsg string + warnMsg string + branch string + target string + start time.Time +} + +// reportPushFailure handles the common tail of a failed sync or retry-push: +// pick the dot-line suffix (`" timed out"` vs blank), log at INFO with a +// classification label, print the warning, and surface the +// checkpoint-remote hint when the target is a URL. +// +// If the outer context is done (user Ctrl-C or hook deadline arrived after +// the first push attempt), this collapses to a silent bail: empty dot suffix, +// no log, no warning. Matches the early bail-out in doPushBranch's +// first-attempt path so that every cancellation point behaves the same. +func reportPushFailure(ctx context.Context, a reportPushFailureArgs) { + if ctx.Err() != nil { + a.stop("") + return } + suffix := "" + if errors.Is(a.err, errPushTimedOut) || errors.Is(a.err, errFetchTimedOut) { + suffix = " timed out" + } + a.stop(suffix) + logging.Info(ctx, a.logMsg, + slog.String("branch", a.branch), + slog.String("target", displayPushTarget(a.target)), + slog.Duration("elapsed", time.Since(a.start)), + slog.String("class", classifyForLog(a.err)), + ) + fmt.Fprint(a.out, a.warnMsg) + printCheckpointRemoteHint(a.target) +} - return nil +// classifyForLog maps a push/fetch error to a short string suitable for log +// fields. Avoids leaking the raw error (which may contain URLs or tokens +// embedded in git's output) — only the category is logged. +func classifyForLog(err error) string { + if err == nil { + return "" + } + if errors.Is(err, errPushTimedOut) || errors.Is(err, errFetchTimedOut) { + return pushClassTimedOut + } + var protectedErr *protectedRefError + if errors.As(err, &protectedErr) { + return pushClassProtectedRef + } + if errors.Is(err, errNonFastForward) { + return pushClassNonFastForward + } + return pushClassOther } // printCheckpointRemoteHint prints a hint when a push to a checkpoint URL fails. @@ -280,13 +435,22 @@ func finishPush(ctx context.Context, stop func(string), result pushResult, targe // tryPushSessionsCommon attempts to push the sessions branch. func tryPushSessionsCommon(ctx context.Context, remoteName, branchName string) (pushResult, error) { - ctx, cancel := context.WithTimeout(ctx, 2*time.Minute) + localCtx, cancel := context.WithTimeout(ctx, pushAttemptTimeout) defer cancel() - result, err := remote.Push(ctx, remoteName, branchName) + result, err := remote.Push(localCtx, remoteName, branchName) outputStr := result.Output if err != nil { - return pushResult{}, classifyPushFailure(ctx, outputStr, err) + // Inner deadline fired: distinct sentinel so doPushBranch can pick + // the right messaging and skip the sync-retry cascade. + if errors.Is(localCtx.Err(), context.DeadlineExceeded) && ctx.Err() == nil { + logging.Debug(ctx, "git push timed out", + slog.String("error", err.Error()), + slog.Duration("after", pushAttemptTimeout), + ) + return pushResult{}, errPushTimedOut + } + return pushResult{}, classifyPushFailure(localCtx, outputStr, err) } return parsePushResult(outputStr), nil @@ -375,10 +539,10 @@ func printProtectedRefBlock(w io.Writer, ref, target string) { // always apply cleanly. // The target can be a remote name or a URL. func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string) error { - ctx, cancel := context.WithTimeout(ctx, 2*time.Minute) + localCtx, cancel := context.WithTimeout(ctx, fetchAttemptTimeout) defer cancel() - fetchTarget, err := remote.ResolveFetchTarget(ctx, target) + fetchTarget, err := remote.ResolveFetchTarget(localCtx, target) if err != nil { return fmt.Errorf("resolve fetch target: %w", err) } @@ -402,15 +566,34 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string // Use --filter=blob:none for a partial fetch that downloads only commits // and trees, skipping blobs. The merge only needs the tree structure to // combine entries; blobs are already local or fetched on demand. - if output, fetchErr := remote.Fetch(ctx, remote.FetchOptions{ + if output, fetchErr := remote.Fetch(localCtx, remote.FetchOptions{ Remote: fetchTarget, RefSpecs: []string{refSpec}, NoTags: true, }); fetchErr != nil { - return fmt.Errorf("fetch failed: %s", output) + // Inner deadline fired (but outer is still alive): distinct sentinel + // so doPushBranch can render " timed out" instead of an empty trailing + // colon. When the outer context fired, let the wrapped error propagate + // so the caller sees the real cancellation cause instead of an inner- + // timeout mislabel. + if errors.Is(localCtx.Err(), context.DeadlineExceeded) && ctx.Err() == nil { + logging.Debug(ctx, "git fetch timed out", + slog.String("error", fetchErr.Error()), + slog.Duration("after", fetchAttemptTimeout), + ) + return errFetchTimedOut + } + msg := strings.TrimSpace(string(output)) + if msg == "" { + // When git is SIGKILLed (kill-on-cancel from execx.KillOnCancel) the + // child writes nothing to stderr; fall back to the wrapper's error + // text so users don't see a bare "fetch failed:" with no detail. + msg = fetchErr.Error() + } + return fmt.Errorf("fetch failed: %s", msg) } - repo, err := OpenRepository(ctx) + repo, err := OpenRepository(localCtx) if err != nil { return fmt.Errorf("failed to open git repository: %w", err) } @@ -421,7 +604,7 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string // this cherry-picks local commits onto remote tip, updating the local ref. // If reconciliation fails, abort — proceeding to rebase on disconnected // branches would silently combine unrelated histories. - if reconcileErr := ReconcileDisconnectedMetadataBranch(ctx, repo, fetchedRefName, os.Stderr); reconcileErr != nil { + if reconcileErr := ReconcileDisconnectedMetadataBranch(localCtx, repo, fetchedRefName, pushStderr); reconcileErr != nil { return fmt.Errorf("metadata reconciliation failed: %w", reconcileErr) } @@ -447,7 +630,7 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string if err != nil { return fmt.Errorf("failed to get repo path: %w", err) } - mergeBase, err := getMergeBase(ctx, repoPath, localRef.Hash().String(), remoteRef.Hash().String()) + mergeBase, err := getMergeBase(localCtx, repoPath, localRef.Hash().String(), remoteRef.Hash().String()) if err != nil { return fmt.Errorf("failed to find merge base: %w", err) } @@ -468,7 +651,7 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string // them onto the remote tip. This preserves local-only commits even when the // local metadata branch already contains old merge commits, while avoiding // replaying shared ancestors older than the true merge-base. - localCommits, err := collectCommitsSince(ctx, repo, repoPath, localRef.Hash(), remoteRef.Hash()) + localCommits, err := collectCommitsSince(localCtx, repo, repoPath, localRef.Hash(), remoteRef.Hash()) if err != nil { return fmt.Errorf("failed to collect local commits: %w", err) } @@ -485,7 +668,7 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string return nil } - newTip, err := cherryPickOnto(ctx, repo, remoteRef.Hash(), localCommits) + newTip, err := cherryPickOnto(localCtx, repo, remoteRef.Hash(), localCommits) if err != nil { return fmt.Errorf("failed to rebase local commits onto remote: %w", err) } diff --git a/cmd/entire/cli/strategy/push_common_test.go b/cmd/entire/cli/strategy/push_common_test.go index 95eec80336..7da69d828b 100644 --- a/cmd/entire/cli/strategy/push_common_test.go +++ b/cmd/entire/cli/strategy/push_common_test.go @@ -1406,15 +1406,21 @@ func TestHasUnmigratedV1Checkpoints(t *testing.T) { func captureStderr(t *testing.T) func() string { t.Helper() old := os.Stderr + oldPushStderr := pushStderr r, w, err := os.Pipe() require.NoError(t, err) os.Stderr = w + // push_common.go writes user-facing output to the package-level pushStderr, + // not directly to os.Stderr. Patch both so legacy tests that only redirect + // os.Stderr still see the same output captured. + pushStderr = w // Safety net: restore stderr and close pipe ends on test failure/panic. // In the normal path the returned function handles cleanup first; // duplicate Close calls return an error that we intentionally ignore. t.Cleanup(func() { os.Stderr = old + pushStderr = oldPushStderr _ = w.Close() _ = r.Close() }) @@ -1426,6 +1432,7 @@ func captureStderr(t *testing.T) func() string { require.NoError(t, readErr) _ = r.Close() os.Stderr = old + pushStderr = oldPushStderr return buf.String() } } diff --git a/cmd/entire/cli/strategy/push_messaging_test.go b/cmd/entire/cli/strategy/push_messaging_test.go new file mode 100644 index 0000000000..7ee44ed8f5 --- /dev/null +++ b/cmd/entire/cli/strategy/push_messaging_test.go @@ -0,0 +1,224 @@ +package strategy + +import ( + "context" + "errors" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/entireio/cli/cmd/entire/cli/logging" + "github.com/entireio/cli/cmd/entire/cli/paths" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// withShortPushTimeout replaces pushAttemptTimeout for the test duration and +// restores it via t.Cleanup. Lets us trigger errPushTimedOut without waiting +// the production 2-minute deadline. +func withShortPushTimeout(t *testing.T, d time.Duration) { + t.Helper() + original := pushAttemptTimeout + pushAttemptTimeout = d + t.Cleanup(func() { pushAttemptTimeout = original }) +} + +// Regression: a colleague's terminal once showed "Warning: couldn't sync …: +// fetch failed:" with nothing after the colon, because the wrapping discarded +// git's underlying error whenever git produced no stdout. The wrapped error +// must now appear in the message even when output is empty. +// +// Not parallel: t.Chdir() for OpenRepository. +func TestFetchAndRebase_FetchFailure_IncludesUnderlyingError(t *testing.T) { + tmpDir := setupRepoWithCheckpointBranch(t) + t.Chdir(tmpDir) + + nonExistent := filepath.Join(t.TempDir(), "does-not-exist") + err := fetchAndRebaseSessionsCommon(context.Background(), nonExistent, paths.MetadataBranchName) + + require.Error(t, err) + msg := err.Error() + require.True(t, strings.HasPrefix(msg, "fetch failed: "), "expected fetch failed prefix, got %q", msg) + detail := strings.TrimPrefix(msg, "fetch failed: ") + require.NotEmpty(t, strings.TrimSpace(detail), "fetch failed must carry a non-empty cause; got %q", msg) +} + +// Regression: when the outer context already fired its deadline (e.g. a +// hook-level timeout shorter than fetchAttemptTimeout), the inner localCtx +// also reports DeadlineExceeded because it derives from outer. A naive check +// against the inner ctx alone would misreport this as errFetchTimedOut and +// swallow the real outer cancellation cause. The fix gates the sentinel on +// the outer ctx still being alive. +// +// Not parallel: t.Chdir() for OpenRepository. +func TestFetchAndRebase_OuterDeadlineFired_DoesNotReturnFetchTimedOut(t *testing.T) { + tmpDir := setupRepoWithCheckpointBranch(t) + t.Chdir(tmpDir) + + ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(-time.Hour)) + defer cancel() + + nonExistent := filepath.Join(t.TempDir(), "does-not-exist") + err := fetchAndRebaseSessionsCommon(ctx, nonExistent, paths.MetadataBranchName) + require.Error(t, err) + assert.NotErrorIs(t, err, errFetchTimedOut, + "outer-context deadline must not be misreported as inner fetch timeout; got %v", err) +} + +// On inner-push timeout, doPushBranch must end the dot line with " timed out", +// skip the sync→retry cascade (same network condition would also trip the +// fetch deadline), and surface the DEBUG-logging hint. +// +// Not parallel: t.Chdir() and stderr capture. +func TestDoPushBranch_TimedOut_PrintsTimedOutAndSkipsSync(t *testing.T) { + workDir, bareDir := setupBareRemoteWithCheckpointBranch(t) + t.Chdir(workDir) + + // Force every push attempt to time out immediately. KillOnCancel + + // WaitDelay ensures the call returns within milliseconds. + // Zero timeout makes localCtx fire DeadlineExceeded synchronously, so the + // remote.Push call sees an already-cancelled context and we exercise the + // errPushTimedOut path without racing the network or local push. + withShortPushTimeout(t, 0) + + restore := captureStderr(t) + err := doPushBranch(context.Background(), bareDir, paths.MetadataBranchName) + output := restore() + + require.NoError(t, err, "doPushBranch should swallow the error (graceful degradation)") + assert.Contains(t, output, "timed out", "dot line must end with 'timed out'") + assert.NotContains(t, output, "Syncing", "must not cascade into sync on inner-push timeout") + assert.Contains(t, output, "log_level", "should hint at enabling DEBUG logging") +} + +// When the outer context is already cancelled (user Ctrl-C, hook deadline), +// doPushBranch must bail without a misleading sync cascade or warning. +// +// Not parallel: t.Chdir() and stderr capture. +func TestDoPushBranch_OuterContextCancelled_BailsQuietly(t *testing.T) { + workDir, bareDir := setupBareRemoteWithCheckpointBranch(t) + t.Chdir(workDir) + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + restore := captureStderr(t) + err := doPushBranch(ctx, bareDir, paths.MetadataBranchName) + output := restore() + + require.NoError(t, err) + assert.NotContains(t, output, "Syncing", "must not run sync after outer cancel") + assert.NotContains(t, output, "Warning:", "must not print sync/retry warnings after outer cancel") +} + +// Push attempts must reach INFO so doctor bundles capture push history at the +// default log level. Without this, a bundle from a stuck push reveals nothing +// about what was attempted. +// +// Not parallel: t.Chdir() and global logger state. +func TestDoPushBranch_LogsAttemptsAtInfo(t *testing.T) { + workDir, bareDir := setupBareRemoteWithCheckpointBranch(t) + t.Chdir(workDir) + + require.NoError(t, logging.Init(context.Background(), "2026-05-12-push-log-test")) + t.Cleanup(logging.Close) + + restore := captureStderr(t) + err := doPushBranch(context.Background(), bareDir, paths.MetadataBranchName) + _ = restore() + require.NoError(t, err) + + logging.Close() // flush + + logBytes, err := os.ReadFile(filepath.Join(workDir, ".entire", "logs", "entire.log")) + require.NoError(t, err) + logText := string(logBytes) + + assert.Contains(t, logText, "push attempt start", "log must record push start") + assert.Contains(t, logText, "push attempt completed", "log must record push completion") + assert.Contains(t, logText, `"branch":"`+paths.MetadataBranchName+`"`, "log must include branch") + // Doctor bundles need the target on completion entries too, not just on + // the start record, so a stuck or retried push can be correlated. + assert.Contains(t, logText, `"target":"`+bareDir+`"`, "completion log must include target") +} + +// reportPushFailure is the single chokepoint for the sync- and retry-stage +// failure tails of doPushBranch. If the outer context is done by the time +// it runs (Ctrl-C / hook deadline that arrived after the first push attempt +// completed), it must collapse to a silent bail — empty dot suffix, no log, +// no warning, no hint. Without this, late cancellation can still produce +// misleading "Warning: couldn't sync …" output. +// +// Safe under t.Parallel because reportPushFailureArgs.out lets us inject a +// per-test writer instead of mutating package-global pushStderr. +func TestReportPushFailure_BailsOnOuterContextCancel(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + var suffixes []string + var buf strings.Builder + + reportPushFailure(ctx, reportPushFailureArgs{ + out: &buf, + stop: func(s string) { suffixes = append(suffixes, s) }, + err: errors.New("simulated sync failure"), + logMsg: "push sync failed", + warnMsg: "[entire] Warning: couldn't sync foo: …\n", + branch: "entire/checkpoints/v1", + target: "origin", + start: time.Now(), + }) + + assert.Equal(t, []string{""}, suffixes, "stop must be called with empty suffix to clear the dot line") + assert.Empty(t, buf.String(), "no warning should be emitted after outer cancel") +} + +// Pins the errPushTimedOut sentinel so future refactors of doPushBranch's +// cascade logic can keep relying on errors.Is(err, errPushTimedOut). +// +// Not parallel: t.Chdir(). +func TestTryPushSessionsCommon_InnerTimeout_ReturnsErrPushTimedOut(t *testing.T) { + workDir, bareDir := setupBareRemoteWithCheckpointBranch(t) + t.Chdir(workDir) + + // Zero timeout makes localCtx fire DeadlineExceeded synchronously, so the + // remote.Push call sees an already-cancelled context and we exercise the + // errPushTimedOut path without racing the network or local push. + withShortPushTimeout(t, 0) + + _, err := tryPushSessionsCommon(context.Background(), bareDir, paths.MetadataBranchName) + require.Error(t, err) + assert.ErrorIs(t, err, errPushTimedOut, + "err must wrap errPushTimedOut; got %v (class=%q)", err, classifyForLog(err)) +} + +// classifyForLog must keep logs free of raw push output (which can contain +// URLs and tokens). Anything we don't recognize collapses to "other". +func TestClassifyForLog(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + err error + want string + }{ + {"nil", nil, ""}, + {"push timeout", errPushTimedOut, pushClassTimedOut}, + {"fetch timeout", errFetchTimedOut, pushClassTimedOut}, + {"non-fast-forward", errNonFastForward, pushClassNonFastForward}, + {"protected", &protectedRefError{output: "GH013"}, pushClassProtectedRef}, + {"other", errors.New("some random error"), pushClassOther}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tc.want, classifyForLog(tc.err)) + }) + } +}