Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion control_plane/src/bin/neon_local.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1415,7 +1415,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
}

if !args.allow_multiple {
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id, None)?;
}

cplane.new_endpoint(
Expand Down Expand Up @@ -1464,6 +1464,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
endpoint.mode,
endpoint.tenant_id,
endpoint.timeline_id,
Some(endpoint_id),
)?;
}

Expand Down
38 changes: 37 additions & 1 deletion control_plane/src/endpoint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -285,12 +285,18 @@ impl ComputeControlPlane {
mode: ComputeMode,
tenant_id: TenantId,
timeline_id: TimelineId,
exclude_id: Option<&str>,
) -> Result<()> {
if matches!(mode, ComputeMode::Primary) {
// This check is not complete: two concurrent attempts at creating a primary
// could both read the state before either one is checked here. It is still
// better than nothing.
let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
let mut duplicates = self.endpoints.iter().filter(|(k, v)| {
if let Some(exclude) = exclude_id {
if k.as_str() == exclude {
return false;
}
}
v.tenant_id == tenant_id
&& v.timeline_id == timeline_id
&& v.mode == mode
Expand Down Expand Up @@ -1142,6 +1148,36 @@ impl Endpoint {
mode: EndpointTerminateMode,
destroy: bool,
) -> Result<TerminateResponse> {
// If the endpoint has crashed (stale pidfile, process already dead),
// skip pg_ctl stop and just clean up. Verify the postmaster is truly
// gone before deleting its pidfile, because Crashed is a heuristic
// (300ms TCP timeout) and could false-positive during startup.
// Note: kill(pid, None) only checks process existence (ESRCH check);
// it does not send a signal.
if self.status() == EndpointStatus::Crashed {
let pidfile = self.pgdata().join("postmaster.pid");
let postmaster_alive = std::fs::read_to_string(&pidfile)
.ok()
.and_then(|s| s.lines().next()?.trim().parse::<i32>().ok())
.map(|pid| kill(nix::unistd::Pid::from_raw(pid), None).is_ok())
.unwrap_or(false);
if !postmaster_alive {
// postgres is dead; make sure compute_ctl follows it down
// before cleanup. wait_until_stopped handles dead PIDs
// immediately (ESRCH → returns on first retry), so it is
// safe to call unconditionally.
let _ = self.wait_for_compute_ctl_to_exit(true);
if destroy {
std::fs::remove_dir_all(self.endpoint_path())?;
} else {
let _ = std::fs::remove_file(&pidfile);
}
return Ok(TerminateResponse { lsn: None });
}
// Falls through: postmaster is alive despite Crashed heuristic;
// proceed with normal stop path.
}

// pg_ctl stop is fast but doesn't allow us to collect LSN. /terminate is
// slow, and test runs time out. Solution: special mode "immediate-terminate"
// which uses /terminate
Expand Down