diff --git a/AGENTS.md b/AGENTS.md index 0c8904a..4297740 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -73,9 +73,9 @@ When a change introduces new config fields, CLI flags, subcommands, or user-visi |------|----------------| | `README.md` | Features list, CLI reference table, Configuration section | | `docs/configuration.md` | Config field reference (top-level table, field section, variant table) | -| `skills/veld-config/SKILL.md` | Agent-facing config reference | -| `skills/veld-usage/SKILL.md` | Agent-facing CLI reference | -| `schema/v1/veld.schema.json` | JSON Schema (usually updated in code, but verify) | +| `skills/veld/SKILL.md` | Agent-facing skill (quick reference, gotchas) | +| `skills/veld/reference/config.md` | Agent-facing config reference | +| `schema/v2/veld.schema.json` | JSON Schema for v2 configs (probes, recovery, skip_if) | | `website/llms-full.txt` | LLM-facing docs (if applicable, see `website/AGENTS.md`) | If the change is purely internal (refactor, bugfix with no new surface area), this checklist does not apply. 
diff --git a/Cargo.lock b/Cargo.lock index c7cad96..d70ff4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1862,7 +1862,7 @@ checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" [[package]] name = "veld" -version = "6.4.0" +version = "6.5.1" dependencies = [ "anyhow", "chrono", @@ -1882,7 +1882,7 @@ dependencies = [ [[package]] name = "veld-core" -version = "6.4.0" +version = "6.5.1" dependencies = [ "anyhow", "base64", @@ -1904,13 +1904,14 @@ dependencies = [ [[package]] name = "veld-daemon" -version = "6.4.0" +version = "6.5.1" dependencies = [ "anyhow", "axum", "chrono", "dirs", "libc", + "reqwest", "serde", "serde_json", "thiserror", @@ -1923,7 +1924,7 @@ dependencies = [ [[package]] name = "veld-helper" -version = "6.4.0" +version = "6.5.1" dependencies = [ "anyhow", "nix", diff --git a/PRD.md b/PRD.md index c4c5336..39bfd19 100644 --- a/PRD.md +++ b/PRD.md @@ -724,7 +724,7 @@ No cross-compilation until v1 is stable. No Tauri. No GTK. No npm in CI. "local": { "type": "command", "script": "./scripts/clone-db.sh", - "verify": "./scripts/verify-db.sh", + "skip_if": "./scripts/verify-db.sh", "outputs": ["DATABASE_URL"], "sensitive_outputs": ["DATABASE_URL"] }, diff --git a/README.md b/README.md index 2542d43..d4022f0 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,8 @@ No port numbers. No manual wiring. Just clean, stable, human-readable URLs. 
- **No port numbers** — work with stable HTTPS URLs instead of `localhost:3847` - **Dependency graph** — resolves node dependencies, parallelizes startup, reverse-order teardown - **TLS by default** — Caddy's internal CA handles TLS termination, auto-trusted during setup -- **Health checks** — two-phase checks (TCP port + HTTP endpoint) before marking services healthy +- **Health checks** — readiness probes (two-phase: TCP port + HTTP/command) gate startup; liveness probes detect failures after startup (e.g., dropped SSH tunnels) +- **Automatic recovery** — when liveness probes detect failure, the environment is automatically restarted (configurable failure threshold and max recovery attempts) - **Multiple variants** — same node, different behaviors (local server, Docker, remote URL) - **Named runs** — multiple environments coexist; re-running by name is idempotent - **Setup / teardown** — project-level lifecycle steps that gate startup (check Docker, create networks) and clean up after stop @@ -26,6 +27,7 @@ No port numbers. No manual wiring. Just clean, stable, human-readable URLs. 
- **Structured output** — all commands support `--json` for scripting and CI - **Browser dashboard** — management UI at `https://veld.localhost` with service health, logs, search, stop/restart - **Client-side logs** — captures browser `console.log/warn/error`, exceptions, and promise rejections; view with `veld logs --source client` +- **Internal logs** — liveness probe outcomes (with stderr), recovery decisions, health state transitions; view with `veld logs --source internal` ## Install @@ -72,8 +74,8 @@ cargo build --release ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "myproject", "url_template": "{service}.{run}.{project}.localhost", "nodes": { @@ -83,7 +85,7 @@ cargo build --release "local": { "type": "start_server", "command": "npm run dev -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/health", "timeout_seconds": 30 } + "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 30 } } } } }, @@ -93,7 +95,7 @@ cargo build --release "local": { "type": "start_server", "command": "npm run dev -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/", "timeout_seconds": 30 }, + "probes": { "readiness": { "type": "http", "path": "/", "timeout_seconds": 30 } }, "depends_on": { "backend": "local" }, "env": { "NEXT_PUBLIC_API_URL": "${nodes.backend.url}" } } @@ -152,7 +154,7 @@ veld stop --name dev ### Step types - **`start_server`** — long-running process. Veld allocates a port (`${veld.port}`), starts the process, and runs health checks. -- **`command`** — runs a command to completion. Can emit outputs by writing `key=value` lines to `$VELD_OUTPUT_FILE` (preferred) or via `VELD_OUTPUT key=value` on stdout (legacy, discouraged). Optional `verify` command for idempotency. +- **`command`** — runs a command to completion. 
Can emit outputs by writing `key=value` lines to `$VELD_OUTPUT_FILE` (preferred) or via `VELD_OUTPUT key=value` on stdout (legacy, discouraged). Optional `skip_if` command for idempotency. ### Setup & teardown diff --git a/crates/veld-core/src/config.rs b/crates/veld-core/src/config.rs index 4f78c4b..78c36b0 100644 --- a/crates/veld-core/src/config.rs +++ b/crates/veld-core/src/config.rs @@ -165,10 +165,16 @@ pub struct VariantConfig { #[serde(default, skip_serializing_if = "Option::is_none")] pub script: Option, - /// Health check configuration (start_server only). + /// Legacy health check configuration (start_server only). + /// Deprecated: use `probes.readiness` instead. #[serde(default, skip_serializing_if = "Option::is_none")] pub health_check: Option, + /// Readiness and liveness probe configuration. + /// `probes.readiness` supersedes the legacy `health_check` field. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub probes: Option, + /// Dependencies: node name -> variant name. #[serde(default, skip_serializing_if = "Option::is_none")] pub depends_on: Option>, @@ -193,9 +199,10 @@ pub struct VariantConfig { #[serde(default = "default_strict_outputs")] pub strict_outputs: bool, - /// Idempotency verify command (command steps only). - #[serde(default, skip_serializing_if = "Option::is_none")] - pub verify: Option, + /// Idempotency check (command steps only) — the step is skipped when this check command exits 0. + /// Previously named `verify` (still accepted for backward compatibility). + #[serde(default, skip_serializing_if = "Option::is_none", alias = "verify")] + pub skip_if: Option, /// Optional URL template override for this specific variant. 
#[serde(default, skip_serializing_if = "Option::is_none")] @@ -414,6 +421,86 @@ pub struct HealthCheck { pub interval_ms: u64, } +// --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- +// Probes +// --------------------------------------------------------------------------- + +/// Readiness and liveness probe configuration for a variant. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ProbesConfig { + /// Readiness probe — gates the dependency graph during startup. + /// Same semantics as the legacy `health_check` field. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub readiness: Option, + + /// Liveness probe — runs continuously after the node is healthy. + /// Triggers recovery when `failure_threshold` consecutive checks fail. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub liveness: Option, +} + +/// Liveness probe configuration. Shares check-type fields with `HealthCheck` +/// but adds failure thresholds and recovery limits. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LivenessProbe { + /// One of "http", "port", "command". + #[serde(rename = "type")] + pub check_type: String, + + /// HTTP path for type "http". + #[serde(default, skip_serializing_if = "Option::is_none")] + pub path: Option, + + /// Expected HTTP status code. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub expect_status: Option, + + /// Command for type "command". + #[serde(default, skip_serializing_if = "Option::is_none")] + pub command: Option, + + /// Milliseconds between liveness checks (default 5000). + #[serde(default = "default_liveness_interval")] + pub interval_ms: u64, + + /// Consecutive failures before triggering recovery (default 3). + #[serde(default = "default_failure_threshold")] + pub failure_threshold: u32, + + /// Maximum number of recovery attempts before permanent failure (default 3). 
+ #[serde(default = "default_max_recoveries")] + pub max_recoveries: u32, +} + +fn default_liveness_interval() -> u64 { + 5000 +} + +fn default_failure_threshold() -> u32 { + 3 +} + +fn default_max_recoveries() -> u32 { + 3 +} + +impl VariantConfig { + /// Resolve the effective readiness probe: `probes.readiness` takes + /// precedence over the legacy `health_check` field. + pub fn readiness_probe(&self) -> Option<&HealthCheck> { + self.probes + .as_ref() + .and_then(|p| p.readiness.as_ref()) + .or(self.health_check.as_ref()) + } + + /// Return the liveness probe if configured. + pub fn liveness_probe(&self) -> Option<&LivenessProbe> { + self.probes.as_ref().and_then(|p| p.liveness.as_ref()) + } +} + fn default_strict_outputs() -> bool { true } @@ -457,7 +544,7 @@ pub fn load_config(path: &Path) -> Result { source: e, })?; - if config.schema_version != "1" { + if config.schema_version != "1" && config.schema_version != "2" { return Err(ConfigError::UnsupportedSchemaVersion( config.schema_version.clone(), )); @@ -862,4 +949,155 @@ mod tests { assert!(config.setup.is_none()); assert!(config.teardown.is_none()); } + + // -- Probes config tests --------------------------------------------------- + + #[test] + fn test_probes_config_deserialization() { + let json = r#"{ + "readiness": { + "type": "http", + "path": "/health", + "timeout_seconds": 30, + "interval_ms": 500 + }, + "liveness": { + "type": "command", + "command": "pg_isready", + "interval_ms": 5000, + "failure_threshold": 5, + "max_recoveries": 2 + } + }"#; + let probes: ProbesConfig = serde_json::from_str(json).unwrap(); + let readiness = probes.readiness.unwrap(); + assert_eq!(readiness.check_type, "http"); + assert_eq!(readiness.path.as_deref(), Some("/health")); + assert_eq!(readiness.timeout_seconds, 30); + + let liveness = probes.liveness.unwrap(); + assert_eq!(liveness.check_type, "command"); + assert_eq!(liveness.command.as_deref(), Some("pg_isready")); + assert_eq!(liveness.interval_ms, 5000); + 
assert_eq!(liveness.failure_threshold, 5); + assert_eq!(liveness.max_recoveries, 2); + } + + #[test] + fn test_liveness_probe_defaults() { + let json = r#"{"type": "command", "command": "true"}"#; + let liveness: LivenessProbe = serde_json::from_str(json).unwrap(); + assert_eq!(liveness.interval_ms, 5000); + assert_eq!(liveness.failure_threshold, 3); + assert_eq!(liveness.max_recoveries, 3); + } + + // -- skip_if / verify alias tests ------------------------------------------ + + #[test] + fn test_skip_if_field() { + let json = r#"{ + "type": "command", + "command": "echo run", + "skip_if": "test -f /tmp/done" + }"#; + let v: VariantConfig = serde_json::from_str(json).unwrap(); + assert_eq!(v.skip_if.as_deref(), Some("test -f /tmp/done")); + } + + #[test] + fn test_verify_alias_for_skip_if() { + let json = r#"{ + "type": "command", + "command": "echo run", + "verify": "test -f /tmp/done" + }"#; + let v: VariantConfig = serde_json::from_str(json).unwrap(); + assert_eq!(v.skip_if.as_deref(), Some("test -f /tmp/done")); + } + + // -- Schema version tests -------------------------------------------------- + + #[test] + fn test_schema_version_2_accepted() { + let json = r#"{ + "schemaVersion": "2", + "name": "test-project", + "nodes": { + "db": { + "variants": { + "local": { + "type": "command", + "command": "echo start", + "probes": { + "liveness": { + "type": "command", + "command": "pg_isready" + } + } + } + } + } + } + }"#; + let config: VeldConfig = serde_json::from_str(json).unwrap(); + assert_eq!(config.schema_version, "2"); + let variant = &config.nodes["db"].variants["local"]; + assert!(variant.probes.is_some()); + let liveness = variant.liveness_probe().unwrap(); + assert_eq!(liveness.check_type, "command"); + } + + // -- Readiness probe helper tests ------------------------------------------ + + #[test] + fn test_readiness_probe_from_probes() { + let json = r#"{ + "type": "start_server", + "command": "npm start", + "probes": { + "readiness": { + "type": 
"http", + "path": "/health" + } + } + }"#; + let v: VariantConfig = serde_json::from_str(json).unwrap(); + let probe = v.readiness_probe().unwrap(); + assert_eq!(probe.check_type, "http"); + } + + #[test] + fn test_readiness_probe_fallback_to_health_check() { + let json = r#"{ + "type": "start_server", + "command": "npm start", + "health_check": { + "type": "port" + } + }"#; + let v: VariantConfig = serde_json::from_str(json).unwrap(); + let probe = v.readiness_probe().unwrap(); + assert_eq!(probe.check_type, "port"); + } + + #[test] + fn test_readiness_probe_probes_overrides_health_check() { + let json = r#"{ + "type": "start_server", + "command": "npm start", + "health_check": { + "type": "port" + }, + "probes": { + "readiness": { + "type": "http", + "path": "/ready" + } + } + }"#; + let v: VariantConfig = serde_json::from_str(json).unwrap(); + let probe = v.readiness_probe().unwrap(); + assert_eq!(probe.check_type, "http"); + } } diff --git a/crates/veld-core/src/graph.rs b/crates/veld-core/src/graph.rs index 755d765..f8123fc 100644 --- a/crates/veld-core/src/graph.rs +++ b/crates/veld-core/src/graph.rs @@ -374,6 +374,58 @@ fn validate_sensitive_outputs( Ok(()) } +/// Return all nodes that transitively depend on `target` within the given +/// set of active nodes, in breadth-first discovery order. NOTE(review): this is +/// not a topological order: a direct dependent may precede another dependent it depends on. +pub fn get_dependents( + target: &NodeSelection, + all_nodes: &[NodeSelection], + config: &VeldConfig, +) -> Vec { + // Build reverse adjacency: for each node, which nodes depend on it. + let mut reverse_deps: HashMap> = HashMap::new(); + for sel in all_nodes { + let variant_cfg = &config.nodes[&sel.node].variants[&sel.variant]; + if let Some(dep_map) = &variant_cfg.depends_on { + for (dep_node, dep_variant) in dep_map { + let dep_key = format!("{dep_node}:{dep_variant}"); + reverse_deps.entry(dep_key).or_default().push(sel.clone()); + } + } + } + + // BFS from target through reverse edges. 
+ let mut visited: HashSet = HashSet::new(); + let mut queue: VecDeque = VecDeque::new(); + let target_key = format!("{}:{}", target.node, target.variant); + visited.insert(target_key.clone()); + + if let Some(direct) = reverse_deps.get(&target_key) { + for dep in direct { + let key = format!("{}:{}", dep.node, dep.variant); + if visited.insert(key) { + queue.push_back(dep.clone()); + } + } + } + + let mut result = Vec::new(); + while let Some(sel) = queue.pop_front() { + let key = format!("{}:{}", sel.node, sel.variant); + if let Some(further) = reverse_deps.get(&key) { + for dep in further { + let dep_key = format!("{}:{}", dep.node, dep.variant); + if visited.insert(dep_key) { + queue.push_back(dep.clone()); + } + } + } + result.push(sel); + } + + result +} + fn check_string_for_ambiguous_refs( s: &str, active_variants: &HashMap<&str, Vec<&str>>, @@ -410,3 +462,201 @@ fn check_string_for_ambiguous_refs( } Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::{NodeConfig, StepType, VariantConfig, VeldConfig}; + use std::collections::HashMap; + + fn make_config() -> VeldConfig { + // db -> api -> frontend (dependency chain) + let db_variant = VariantConfig { + step_type: StepType::Command, + command: Some("echo db".into()), + script: None, + health_check: None, + probes: None, + depends_on: None, + env: None, + outputs: None, + sensitive_outputs: None, + strict_outputs: true, + skip_if: None, + url_template: None, + on_stop: None, + client_log_levels: None, + features: None, + cwd: None, + }; + let api_variant = VariantConfig { + step_type: StepType::StartServer, + command: Some("echo api".into()), + script: None, + health_check: None, + probes: None, + depends_on: Some(HashMap::from([("db".into(), "local".into())])), + env: None, + outputs: None, + sensitive_outputs: None, + strict_outputs: true, + skip_if: None, + url_template: None, + on_stop: None, + client_log_levels: None, + features: None, + cwd: None, + }; + let frontend_variant = 
VariantConfig { + step_type: StepType::StartServer, + command: Some("echo fe".into()), + script: None, + health_check: None, + probes: None, + depends_on: Some(HashMap::from([("api".into(), "local".into())])), + env: None, + outputs: None, + sensitive_outputs: None, + strict_outputs: true, + skip_if: None, + url_template: None, + on_stop: None, + client_log_levels: None, + features: None, + cwd: None, + }; + + VeldConfig { + schema: None, + schema_version: "2".into(), + name: "test".into(), + url_template: "{service}.{run}.{project}.localhost".into(), + presets: None, + client_log_levels: None, + features: None, + env: None, + setup: None, + teardown: None, + nodes: HashMap::from([ + ( + "db".into(), + NodeConfig { + default_variant: Some("local".into()), + url_template: None, + hidden: None, + client_log_levels: None, + features: None, + env: None, + cwd: None, + variants: HashMap::from([("local".into(), db_variant)]), + }, + ), + ( + "api".into(), + NodeConfig { + default_variant: Some("local".into()), + url_template: None, + hidden: None, + client_log_levels: None, + features: None, + env: None, + cwd: None, + variants: HashMap::from([("local".into(), api_variant)]), + }, + ), + ( + "frontend".into(), + NodeConfig { + default_variant: Some("local".into()), + url_template: None, + hidden: None, + client_log_levels: None, + features: None, + env: None, + cwd: None, + variants: HashMap::from([("local".into(), frontend_variant)]), + }, + ), + ]), + } + } + + #[test] + fn test_get_dependents_leaf_node() { + let config = make_config(); + let all_nodes = vec![ + NodeSelection { + node: "db".into(), + variant: "local".into(), + }, + NodeSelection { + node: "api".into(), + variant: "local".into(), + }, + NodeSelection { + node: "frontend".into(), + variant: "local".into(), + }, + ]; + let target = NodeSelection { + node: "frontend".into(), + variant: "local".into(), + }; + let deps = get_dependents(&target, &all_nodes, &config); + assert!(deps.is_empty(), "leaf node 
should have no dependents"); + } + + #[test] + fn test_get_dependents_root_node() { + let config = make_config(); + let all_nodes = vec![ + NodeSelection { + node: "db".into(), + variant: "local".into(), + }, + NodeSelection { + node: "api".into(), + variant: "local".into(), + }, + NodeSelection { + node: "frontend".into(), + variant: "local".into(), + }, + ]; + let target = NodeSelection { + node: "db".into(), + variant: "local".into(), + }; + let deps = get_dependents(&target, &all_nodes, &config); + assert_eq!(deps.len(), 2); + let dep_names: Vec = deps.iter().map(|d| d.node.clone()).collect(); + assert!(dep_names.contains(&"api".to_string())); + assert!(dep_names.contains(&"frontend".to_string())); + } + + #[test] + fn test_get_dependents_middle_node() { + let config = make_config(); + let all_nodes = vec![ + NodeSelection { + node: "db".into(), + variant: "local".into(), + }, + NodeSelection { + node: "api".into(), + variant: "local".into(), + }, + NodeSelection { + node: "frontend".into(), + variant: "local".into(), + }, + ]; + let target = NodeSelection { + node: "api".into(), + variant: "local".into(), + }; + let deps = get_dependents(&target, &all_nodes, &config); + assert_eq!(deps.len(), 1); + assert_eq!(deps[0].node, "frontend"); + } +} diff --git a/crates/veld-core/src/logging.rs b/crates/veld-core/src/logging.rs index 27be4cd..b7f15fe 100644 --- a/crates/veld-core/src/logging.rs +++ b/crates/veld-core/src/logging.rs @@ -59,6 +59,12 @@ pub fn debug_log_file(project_root: &Path, run_name: &str) -> PathBuf { log_dir(project_root, run_name).join("veld-debug.log") } +/// Return the internal (veld daemon/orchestrator) log file for a run. +/// Contains liveness probe outcomes, recovery decisions, health transitions. +pub fn internal_log_file(project_root: &Path, run_name: &str) -> PathBuf { + log_dir(project_root, run_name).join("_veld.log") +} + /// Return a temporary output file path for a command node. 
/// /// Scripts write `key=value` lines to this file instead of emitting diff --git a/crates/veld-core/src/orchestrator.rs b/crates/veld-core/src/orchestrator.rs index 2599a31..795f1fd 100644 --- a/crates/veld-core/src/orchestrator.rs +++ b/crates/veld-core/src/orchestrator.rs @@ -17,7 +17,7 @@ use crate::port::PortAllocator; use crate::process; use crate::progress::ProgressEvent; use crate::state::{ - GlobalRegistry, HealthCheckPhase, NodeState, NodeStatus, ProjectState, RegistryEntry, + GlobalRegistry, NodeState, NodeStatus, ProjectState, ReadinessPhase, RegistryEntry, RegistryRunInfo, RunState, RunStatus, }; use crate::url; @@ -164,6 +164,8 @@ pub struct Orchestrator { foreground: bool, /// Optional channel for live progress events. progress_tx: Option>, + /// Internal log writer for the current run (liveness/recovery/lifecycle events). + internal_log: Option, } impl Orchestrator { @@ -183,6 +185,7 @@ impl Orchestrator { debug_writer: None, foreground: false, progress_tx: None, + internal_log: None, } } @@ -220,6 +223,13 @@ impl Orchestrator { } } + /// Write a line to the internal log (per-run lifecycle events). + async fn internal_log(&self, message: &str) { + if let Some(ref writer) = self.internal_log { + let _ = writer.write_line(message).await; + } + } + /// Convenience: discover config from CWD and build the orchestrator. pub fn from_cwd() -> Result { let (path, cfg) = config::load_config_from_cwd()?; @@ -260,6 +270,12 @@ impl Orchestrator { } } + // Create internal log writer for this run. + let log_path = logging::internal_log_file(&self.project_root, run_name); + if let Ok(writer) = LogWriter::new(log_path).await { + self.internal_log = Some(writer); + } + let resolved = graph::resolve_selections(selections, &self.config)?; let plan = graph::build_execution_plan(&resolved, &self.config)?; @@ -394,6 +410,13 @@ impl Orchestrator { // Count total nodes for progress reporting. 
let total_nodes: usize = plan.iter().map(|s| s.len()).sum(); + self.internal_log(&format!( + "[start] starting environment '{}' — {} node(s) in {} stage(s)", + run_name, + total_nodes, + plan.len() + )) + .await; self.emit(ProgressEvent::PlanResolved { total_nodes, stages: plan.len(), @@ -436,6 +459,10 @@ impl Orchestrator { } .await; + if let Err(ref e) = execute_result { + self.internal_log(&format!("[start] startup failed: {e}")) + .await; + } if let Err(e) = execute_result { // Release all remaining port reservations so the ports become // available to the system immediately. @@ -451,6 +478,12 @@ impl Orchestrator { // Final state save with Running status. self.save_state(&run)?; + self.internal_log(&format!( + "[start] environment '{}' is running — all {} node(s) healthy", + run_name, total_nodes + )) + .await; + Ok(run) } @@ -588,6 +621,16 @@ impl Orchestrator { /// Stop a run in reverse dependency order. Returns whether the run was /// actually stopped or was already stopped. pub async fn stop(&mut self, run_name: &str) -> Result { + // Create internal log writer for this run (may already exist from start). + if self.internal_log.is_none() { + let log_path = logging::internal_log_file(&self.project_root, run_name); + if let Ok(writer) = LogWriter::new(log_path).await { + self.internal_log = Some(writer); + } + } + self.internal_log(&format!("[stop] stopping environment '{run_name}'")) + .await; + // Reconnect to whichever helper is running (system or user socket) if let Ok(client) = crate::helper::HelperClient::connect().await { self.helper_client = client; @@ -633,6 +676,12 @@ impl Orchestrator { for key in node_keys.iter().rev() { if let Some(node_state) = run.nodes.get_mut(key) { + self.internal_log(&format!( + "[stop] stopping {}:{} (pid: {:?})", + node_state.node_name, node_state.variant, node_state.pid + )) + .await; + // Kill process if running. 
if let Some(pid) = node_state.pid { if process::is_alive(pid) { @@ -678,6 +727,9 @@ impl Orchestrator { // Remove from global registry. self.remove_from_registry(run_name); + self.internal_log(&format!("[stop] environment '{run_name}' stopped")) + .await; + Ok(StopResult::Stopped) } @@ -1131,7 +1183,7 @@ async fn debug_log_free(writer: &Option, message: &str) { } } -/// Build a health-check attempt notifier that sends progress events. +/// Build a readiness probe attempt notifier that sends progress events. fn make_attempt_notifier( tx: &Option>, node: &str, @@ -1143,7 +1195,7 @@ fn make_attempt_notifier( let variant = variant.to_owned(); Box::new(move |attempt| { if let Some(tx) = &tx { - let _ = tx.send(ProgressEvent::HealthCheckAttempt { + let _ = tx.send(ProgressEvent::ReadinessProbeAttempt { node: node.clone(), variant: variant.clone(), phase, @@ -1456,24 +1508,26 @@ async fn execute_start_server_isolated( let _ = project_state.save(&project_root); } - // Health check — inlined to emit progress events between phases. + // Readiness probe — inlined to emit progress events between phases. debug_log_free( &ctx.debug_writer, &format!( - "{}:{} — process started (pid {}), beginning health checks", + "{}:{} — process started (pid {}), beginning readiness checks", sel.node, sel.variant, pid ), ) .await; - if let Some(ref hc) = variant_cfg.health_check { + // Use probes.readiness if available, falling back to legacy health_check. + if let Some(hc) = variant_cfg.readiness_probe() { + let hc = hc.clone(); node_state.status = NodeStatus::HealthChecking; - node_state.health_phases.push(HealthCheckPhase { + node_state.readiness_phases.push(ReadinessPhase { phase: 1, passed: false, last_error: None, passed_at: None, }); - node_state.health_phases.push(HealthCheckPhase { + node_state.readiness_phases.push(ReadinessPhase { phase: 2, passed: false, last_error: None, @@ -1535,7 +1589,7 @@ async fn execute_start_server_isolated( // Phase 1: TCP port check. 
emit_progress( &ctx.progress_tx, - ProgressEvent::HealthCheckPhase { + ProgressEvent::ReadinessProbePhase { node: sel.node.clone(), variant: sel.variant.clone(), phase: 1, @@ -1544,7 +1598,7 @@ async fn execute_start_server_isolated( ); let phase1_result = tokio::select! { - result = health::wait_for_port(port, hc, Some(&phase1_notifier)) => result, + result = health::wait_for_port(port, &hc, Some(&phase1_notifier)) => result, _ = wait_for_process_exit(pid) => { Err(health::HealthError::PortCheckFailed( "server process exited before binding to port".into(), @@ -1555,11 +1609,11 @@ async fn execute_start_server_isolated( if let Err(e) = phase1_result { let msg = format!("process did not bind to port {port}: {e}"); node_state.status = NodeStatus::Failed; - node_state.health_phases[0].last_error = Some(msg.clone()); + node_state.readiness_phases[0].last_error = Some(msg.clone()); debug_log_free( &ctx.debug_writer, &format!( - "{}:{} — health check phase 1 FAILED: {}", + "{}:{} — readiness phase 1 FAILED: {}", sel.node, sel.variant, msg ), ) @@ -1580,11 +1634,11 @@ async fn execute_start_server_isolated( } let now = chrono::Utc::now(); - node_state.health_phases[0].passed = true; - node_state.health_phases[0].passed_at = Some(now); + node_state.readiness_phases[0].passed = true; + node_state.readiness_phases[0].passed_at = Some(now); emit_progress( &ctx.progress_tx, - ProgressEvent::HealthCheckPassed { + ProgressEvent::ReadinessProbePassed { node: sel.node.clone(), variant: sel.variant.clone(), phase: 1, @@ -1599,13 +1653,13 @@ async fn execute_start_server_isolated( // Phase 2: depends on check type. 
let phase2_desc = match hc.check_type.as_str() { "http" => format!("HTTP check on port {port}"), - "command" | "bash" => "command health check".to_owned(), + "command" | "bash" => "command readiness check".to_owned(), "port" => "port-only (no phase 2)".to_owned(), other => format!("unknown check type: {other}"), }; emit_progress( &ctx.progress_tx, - ProgressEvent::HealthCheckPhase { + ProgressEvent::ReadinessProbePhase { node: sel.node.clone(), variant: sel.variant.clone(), phase: 2, @@ -1617,14 +1671,14 @@ async fn execute_start_server_isolated( match hc.check_type.as_str() { "http" => { let direct_url = format!("http://127.0.0.1:{port}"); - health::wait_for_http(&direct_url, hc, Some(&phase2_notifier)).await + health::wait_for_http(&direct_url, &hc, Some(&phase2_notifier)).await } "command" | "bash" => { if let Some(cmd) = &hc.command { health::wait_for_command_check( cmd, &working_dir, - hc, + &hc, Some(&phase2_notifier), ) .await @@ -1640,7 +1694,7 @@ async fn execute_start_server_isolated( result = phase2_future => result, _ = wait_for_process_exit(pid) => { Err(health::HealthError::PortCheckFailed( - "server process exited during health check".into(), + "server process exited during readiness check".into(), )) } }; @@ -1648,12 +1702,12 @@ async fn execute_start_server_isolated( match phase2_result { Ok(()) => { let now = chrono::Utc::now(); - node_state.health_phases[1].passed = true; - node_state.health_phases[1].passed_at = Some(now); + node_state.readiness_phases[1].passed = true; + node_state.readiness_phases[1].passed_at = Some(now); node_state.status = NodeStatus::Healthy; emit_progress( &ctx.progress_tx, - ProgressEvent::HealthCheckPassed { + ProgressEvent::ReadinessProbePassed { node: sel.node.clone(), variant: sel.variant.clone(), phase: 2, @@ -1662,7 +1716,7 @@ async fn execute_start_server_isolated( debug_log_free( &ctx.debug_writer, &format!( - "{}:{} — health check passed, node is healthy", + "{}:{} — readiness check passed, node is healthy", 
sel.node, sel.variant ), ) @@ -1671,11 +1725,11 @@ async fn execute_start_server_isolated( Err(e) => { node_state.status = NodeStatus::Failed; let msg = e.to_string(); - node_state.health_phases[1].last_error = Some(msg.clone()); + node_state.readiness_phases[1].last_error = Some(msg.clone()); debug_log_free( &ctx.debug_writer, &format!( - "{}:{} — health check phase 2 FAILED: {}", + "{}:{} — readiness phase 2 FAILED: {}", sel.node, sel.variant, msg ), ) @@ -1736,16 +1790,17 @@ async fn execute_command_isolated( ); let env = build_env(merged_env.as_ref(), var_ctx)?; - // Verify step (idempotency). - if let Some(ref verify_cmd) = variant_cfg.verify { - let verify_resolved = crate::variables::interpolate(verify_cmd, var_ctx)?; - let verify_result = process::run_command(&verify_resolved, &working_dir, &env, None).await; - if let Ok(ref out) = verify_result { + // Idempotency check (skip_if). + if let Some(ref skip_if_cmd) = variant_cfg.skip_if { + let skip_if_resolved = crate::variables::interpolate(skip_if_cmd, var_ctx)?; + let skip_if_result = + process::run_command(&skip_if_resolved, &working_dir, &env, None).await; + if let Ok(ref out) = skip_if_result { if out.exit_code == 0 { tracing::info!( node = sel.node, variant = sel.variant, - "verify passed — skipping command step" + "skip_if passed — skipping command step" ); node_state.status = NodeStatus::Skipped; node_state @@ -1811,7 +1866,113 @@ async fn execute_command_isolated( } if result.exit_code == 0 { - node_state.status = NodeStatus::Healthy; + // Run readiness probe if configured (probes.readiness on command nodes). 
+ if let Some(hc) = variant_cfg.readiness_probe() { + let hc = hc.clone(); + node_state.status = NodeStatus::HealthChecking; + emit_progress( + &ctx.progress_tx, + ProgressEvent::ReadinessProbePhase { + node: sel.node.clone(), + variant: sel.variant.clone(), + phase: 1, + description: "readiness probe".to_owned(), + }, + ); + + let notifier = make_attempt_notifier(&ctx.progress_tx, &sel.node, &sel.variant, 1); + let probe_result = match hc.check_type.as_str() { + "command" | "bash" => { + if let Some(cmd) = &hc.command { + health::wait_for_command_check(cmd, &working_dir, &hc, Some(¬ifier)) + .await + } else { + Ok(()) + } + } + "port" => { + // Port check — look for a port value in outputs. + // Checks common key names; a future enhancement could add + // an explicit `port_key` field to HealthCheck. + let port_str = node_state + .outputs + .get("PORT") + .or(node_state.outputs.get("DB_PORT")) + .or(node_state.outputs.get("SERVICE_PORT")); + if let Some(port_str) = port_str { + if let Ok(port) = port_str.parse::() { + health::wait_for_port(port, &hc, Some(¬ifier)).await + } else { + tracing::warn!( + node = sel.node, + variant = sel.variant, + "readiness port probe: output value is not a valid port number" + ); + Ok(()) + } + } else { + tracing::warn!( + node = sel.node, + variant = sel.variant, + "readiness port probe skipped: no PORT/DB_PORT/SERVICE_PORT output found" + ); + Ok(()) + } + } + "http" => { + // HTTP check — look for a URL value in outputs. 
+ let url = node_state + .outputs + .get("URL") + .or(node_state.outputs.get("DATABASE_URL")) + .or(node_state.outputs.get("SERVICE_URL")); + if let Some(url) = url { + health::wait_for_http(url, &hc, Some(¬ifier)).await + } else { + tracing::warn!( + node = sel.node, + variant = sel.variant, + "readiness http probe skipped: no URL/DATABASE_URL/SERVICE_URL output found" + ); + Ok(()) + } + } + _ => Ok(()), + }; + + match probe_result { + Ok(()) => { + node_state.status = NodeStatus::Healthy; + emit_progress( + &ctx.progress_tx, + ProgressEvent::ReadinessProbePassed { + node: sel.node.clone(), + variant: sel.variant.clone(), + phase: 1, + }, + ); + } + Err(e) => { + node_state.status = NodeStatus::Failed; + let reason = format!("readiness probe failed: {e}"); + emit_progress( + &ctx.progress_tx, + ProgressEvent::NodeFailed { + node: sel.node.clone(), + variant: sel.variant.clone(), + error: reason.clone(), + }, + ); + return Err(OrchestratorError::NodeFailed { + node: sel.node.clone(), + variant: sel.variant.clone(), + reason, + }); + } + } + } else { + node_state.status = NodeStatus::Healthy; + } } else { node_state.status = NodeStatus::Failed; let reason = format!("command step exited with code {}", result.exit_code); @@ -1838,7 +1999,7 @@ async fn execute_command_isolated( // --------------------------------------------------------------------------- /// Poll until a process is no longer alive. Checks every 250ms. -/// Used to race health checks against premature process death so the +/// Used to race readiness checks against premature process death so the /// orchestrator can fail fast instead of waiting for the full timeout. async fn wait_for_process_exit(pid: u32) { loop { diff --git a/crates/veld-core/src/progress.rs b/crates/veld-core/src/progress.rs index 5ed6272..0ad6724 100644 --- a/crates/veld-core/src/progress.rs +++ b/crates/veld-core/src/progress.rs @@ -25,24 +25,24 @@ pub enum ProgressEvent { port: u16, }, - /// Health check phase started. 
- HealthCheckPhase { + /// Readiness probe phase started. + ReadinessProbePhase { node: String, variant: String, phase: u8, description: String, }, - /// Health check attempt (retry) within a phase. - HealthCheckAttempt { + /// Readiness probe attempt (retry) within a phase. + ReadinessProbeAttempt { node: String, variant: String, phase: u8, attempt: u32, }, - /// Health check phase passed. - HealthCheckPassed { + /// Readiness probe phase passed. + ReadinessProbePassed { node: String, variant: String, phase: u8, @@ -56,7 +56,7 @@ pub enum ProgressEvent { elapsed_ms: u64, }, - /// Node was skipped (verify command passed). + /// Node was skipped (skip_if command passed). NodeSkipped { node: String, variant: String }, /// Node failed. @@ -92,9 +92,9 @@ pub enum ProgressEvent { /// A teardown step completed. TeardownStepCompleted { name: String }, - /// Service log lines streamed during slow health checks. + /// Service log lines streamed during slow readiness checks. /// - /// Emitted after a delay when health checks are taking longer than + /// Emitted after a delay when readiness checks are taking longer than /// expected, giving the user visibility into what the service is doing. 
NodeLogLines { node: String, @@ -159,28 +159,28 @@ mod tests { } #[test] - fn test_health_check_phase_serialization() { - let event = ProgressEvent::HealthCheckPhase { + fn test_readiness_probe_phase_serialization() { + let event = ProgressEvent::ReadinessProbePhase { node: "api".into(), variant: "local".into(), phase: 1, description: "waiting for port 8080".into(), }; let json = serde_json::to_string(&event).unwrap(); - assert!(json.contains("\"type\":\"health_check_phase\"")); + assert!(json.contains("\"type\":\"readiness_probe_phase\"")); assert!(json.contains("\"phase\":1")); } #[test] - fn test_health_check_attempt_serialization() { - let event = ProgressEvent::HealthCheckAttempt { + fn test_readiness_probe_attempt_serialization() { + let event = ProgressEvent::ReadinessProbeAttempt { node: "api".into(), variant: "local".into(), phase: 1, attempt: 5, }; let json = serde_json::to_string(&event).unwrap(); - assert!(json.contains("\"type\":\"health_check_attempt\"")); + assert!(json.contains("\"type\":\"readiness_probe_attempt\"")); assert!(json.contains("\"phase\":1")); assert!(json.contains("\"attempt\":5")); } diff --git a/crates/veld-core/src/state.rs b/crates/veld-core/src/state.rs index 0742ffd..ee97eb7 100644 --- a/crates/veld-core/src/state.rs +++ b/crates/veld-core/src/state.rs @@ -44,6 +44,8 @@ pub enum StateError { pub enum RunStatus { Starting, Running, + /// A recovery cycle is in progress for one or more nodes. + Recovering, Stopping, Stopped, Failed, @@ -60,17 +62,19 @@ pub enum NodeStatus { Starting, HealthChecking, Healthy, + /// Liveness probe failed but recovery has not yet been exhausted. 
+ Unhealthy, Failed, Stopped, Skipped, } // --------------------------------------------------------------------------- -// Health check phase tracking +// Readiness phase tracking // --------------------------------------------------------------------------- #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct HealthCheckPhase { +pub struct ReadinessPhase { pub phase: u8, // 1 = port, 2 = HTTPS pub passed: bool, pub last_error: Option, @@ -91,7 +95,18 @@ pub struct NodeState { pub port: Option, pub url: Option, pub outputs: HashMap, - pub health_phases: Vec, + /// Readiness probe phase tracking (renamed from `health_phases` in v7). + #[serde(default, alias = "health_phases")] + pub readiness_phases: Vec, + /// Number of recovery attempts completed for this node. + #[serde(default)] + pub recovery_count: u32, + /// Current streak of consecutive liveness probe failures. + #[serde(default)] + pub consecutive_failures: u32, + /// Error message from the most recent liveness probe failure. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub last_liveness_error: Option, /// Output keys whose values are sensitive (encrypted at rest, masked in display). 
#[serde(default, skip_serializing_if = "Vec::is_empty")] pub sensitive_keys: Vec, @@ -107,7 +122,10 @@ impl NodeState { port: None, url: None, outputs: HashMap::new(), - health_phases: Vec::new(), + readiness_phases: Vec::new(), + recovery_count: 0, + consecutive_failures: 0, + last_liveness_error: None, sensitive_keys: Vec::new(), } } diff --git a/crates/veld-daemon/Cargo.toml b/crates/veld-daemon/Cargo.toml index 209a2d3..4419129 100644 --- a/crates/veld-daemon/Cargo.toml +++ b/crates/veld-daemon/Cargo.toml @@ -22,4 +22,5 @@ chrono = { workspace = true } dirs = { workspace = true } libc = "0.2" axum = "0.8" +reqwest = { workspace = true } uuid = { workspace = true } diff --git a/crates/veld-daemon/assets/management-ui.html b/crates/veld-daemon/assets/management-ui.html index fc2e67e..729f989 100644 --- a/crates/veld-daemon/assets/management-ui.html +++ b/crates/veld-daemon/assets/management-ui.html @@ -49,6 +49,7 @@ .badge.stopped{background:rgba(85,88,112,.15);color:var(--dim)} .badge.starting,.badge.stopping,.badge.health_checking,.badge.pending{background:var(--yellow-bg);color:var(--yellow)} .badge.failed{background:var(--red-bg);color:var(--red)} +.badge.recovering,.badge.unhealthy{background:var(--yellow-bg,#3d3800);color:var(--yellow)} .badge.skipped{background:rgba(85,88,112,.15);color:var(--dim)} .card-sub{display:flex;align-items:center;gap:8px;padding:0 18px 12px;font-size:12px;color:var(--text2)} @@ -72,7 +73,11 @@ .health-dot.healthy,.health-dot.running{background:var(--green)} .health-dot.starting,.health-dot.health_checking,.health-dot.pending{background:var(--yellow)} .health-dot.failed{background:var(--red)} +.health-dot.unhealthy{background:var(--yellow)} .health-dot.stopped,.health-dot.skipped{background:var(--dim)} +.svc-liveness{font-size:11px;color:var(--dim);padding-left:20px} +.svc-liveness .warn{color:var(--yellow)} +.svc-liveness .err{color:var(--red)} /* services table (merged URLs + debug) */ 
.svc-table{width:100%;font-size:12px;border-collapse:collapse} @@ -238,6 +243,15 @@ h+=''; h+=''+esc(n.variant)+''; h+=''+(n.pid||'')+''; + if(n.recovery_count||n.consecutive_failures||n.last_liveness_error){ + h+=''; + var parts=[]; + if(n.consecutive_failures)parts.push('failures: '+n.consecutive_failures+''); + if(n.recovery_count)parts.push('recoveries: '+n.recovery_count); + if(n.last_liveness_error)parts.push(''+esc(n.last_liveness_error)+''); + h+=parts.join(' · '); + h+=''; + } } h+=''; }else{h+='
No services
';} @@ -271,6 +285,7 @@ ''+ ''+ ''+ + ''+ ''+ ''+ ''+ diff --git a/crates/veld-daemon/src/management.rs b/crates/veld-daemon/src/management.rs index 4cb46a9..b7b7e64 100644 --- a/crates/veld-daemon/src/management.rs +++ b/crates/veld-daemon/src/management.rs @@ -71,6 +71,16 @@ struct NodeInfo { status: NodeStatus, url: Option, pid: Option, + #[serde(skip_serializing_if = "is_zero")] + recovery_count: u32, + #[serde(skip_serializing_if = "is_zero")] + consecutive_failures: u32, + #[serde(skip_serializing_if = "Option::is_none")] + last_liveness_error: Option, +} + +fn is_zero(v: &u32) -> bool { + *v == 0 } async fn list_environments() -> Result, StatusCode> { @@ -102,6 +112,9 @@ async fn list_environments() -> Result, StatusCode> { status: ns.status.clone(), url: ns.url.clone(), pid: ns.pid, + recovery_count: ns.recovery_count, + consecutive_failures: ns.consecutive_failures, + last_liveness_error: ns.last_liveness_error.clone(), }) .collect() }) @@ -200,8 +213,30 @@ async fn get_logs( let lines_limit = q.lines.clamp(1, 5000); let include_server = q.source == "all" || q.source == "server"; let include_client = q.source == "all" || q.source == "client"; + let include_internal = q.source == "all" || q.source == "internal" || q.source == "veld"; let mut nodes = Vec::new(); + // Internal (veld daemon) log — not per-node, shown as _veld:internal. 
+ if include_internal { + let log_path = logging::internal_log_file(&project_root, &run_name); + let lines = if log_path.exists() { + let raw = logging::tail_lines(&log_path, lines_limit) + .await + .unwrap_or_default(); + logging::merge_continuation_lines(raw) + } else { + Vec::new() + }; + if !lines.is_empty() { + nodes.push(NodeLogs { + node: "_veld".to_owned(), + variant: "internal".to_owned(), + source: "internal".to_owned(), + lines, + }); + } + } + for ns in run_state.nodes.values() { if let Some(ref filter) = q.node { if ns.node_name != *filter { diff --git a/crates/veld-daemon/src/monitor.rs b/crates/veld-daemon/src/monitor.rs index 500563a..8d0a27b 100644 --- a/crates/veld-daemon/src/monitor.rs +++ b/crates/veld-daemon/src/monitor.rs @@ -1,20 +1,78 @@ use crate::broadcaster::Broadcaster; +use std::collections::HashMap; +use std::path::Path; +use std::time::{Duration, Instant}; use tracing::{debug, info, warn}; -use veld_core::state::{GlobalRegistry, ProjectState, RunStatus}; +use veld_core::config::{self, LivenessProbe, VeldConfig}; +use veld_core::logging::{self, LogWriter}; +use veld_core::state::{GlobalRegistry, NodeStatus, ProjectState, RunStatus}; /// Interval between health-check scans (seconds). const SCAN_INTERVAL_SECS: u64 = 5; +/// Tracks when each node's liveness probe was last executed. +/// Key: `"project_root:run_name:node:variant"`. +type LastCheckMap = HashMap; + +/// Resolve the user's full PATH by spawning an interactive login shell. +/// Falls back to the current process PATH if resolution fails. +fn resolve_user_path() -> String { + let shell = std::env::var("SHELL").unwrap_or_else(|_| "sh".to_owned()); + // Use -l -i -c to get a fully initialized interactive login shell. + // This captures PATH after .zprofile/.bash_profile/brew shellenv etc. 
+ let output = std::process::Command::new(&shell) + .arg("-l") + .arg("-i") + .arg("-c") + .arg("echo $PATH") + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::null()) + .output(); + + match output { + Ok(o) if o.status.success() => { + let path = String::from_utf8_lossy(&o.stdout).trim().to_owned(); + if !path.is_empty() { + info!(path = %path, "resolved user PATH from login shell"); + return path; + } + } + Ok(o) => { + debug!( + exit_code = o.status.code(), + "login shell PATH resolution exited non-zero, using fallback" + ); + } + Err(e) => { + debug!(error = %e, "failed to resolve user PATH, using fallback"); + } + } + + std::env::var("PATH").unwrap_or_default() +} + /// Periodically scan all runs from the global registry and check process health. /// When a status change is detected, update the registry and broadcast the event. pub async fn run_health_monitor(broadcaster: Broadcaster) { let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(SCAN_INTERVAL_SECS)); + let mut last_checks: LastCheckMap = HashMap::new(); + + // Resolve the user's full PATH once at startup so probe commands can + // find tools like pg_isready even when the daemon starts at boot. + let mut user_path = resolve_user_path(); + let mut path_resolved_at = Instant::now(); loop { interval.tick().await; debug!("running health-check scan"); - match scan_and_update(&broadcaster).await { + // Re-resolve PATH every 60s to pick up changes after user login. + if path_resolved_at.elapsed() > Duration::from_secs(60) { + user_path = resolve_user_path(); + path_resolved_at = Instant::now(); + } + + match scan_and_update(&broadcaster, &mut last_checks, &user_path).await { Ok(changes) => { if changes > 0 { info!("health scan detected {changes} status change(s)"); @@ -29,7 +87,11 @@ pub async fn run_health_monitor(broadcaster: Broadcaster) { /// Scan the global registry, check each running process, and return the number /// of status changes applied. 
-async fn scan_and_update(broadcaster: &Broadcaster) -> anyhow::Result { +async fn scan_and_update( + broadcaster: &Broadcaster, + last_checks: &mut LastCheckMap, + user_path: &str, +) -> anyhow::Result { let registry = GlobalRegistry::load()?; let mut changes = 0; @@ -120,15 +182,547 @@ async fn scan_and_update(broadcaster: &Broadcaster) -> anyhow::Result { broadcaster.broadcast(&event).await; changes += 1; + continue; // Skip liveness checks for a run that just stopped. } + + // --- Liveness probe checks --- + // Load the project config to access probe definitions. + let config = match load_config_for_project(project_root) { + Some(c) => c, + None => continue, + }; + + // Create internal log writer for this run. + let log_path = logging::internal_log_file(project_root, run_name); + let internal_log = LogWriter::new(log_path).await.ok(); + + changes += run_liveness_checks( + project_root, + run_name, + &config, + broadcaster, + last_checks, + internal_log.as_ref(), + user_path, + ) + .await; } } Ok(changes) } +/// Run liveness probes for all healthy nodes in a run. Returns number of state changes. +async fn run_liveness_checks( + project_root: &Path, + run_name: &str, + config: &VeldConfig, + broadcaster: &Broadcaster, + last_checks: &mut LastCheckMap, + internal_log: Option<&LogWriter>, + user_path: &str, +) -> usize { + // Reload state fresh for liveness checks. + let mut project_state = match ProjectState::load(project_root) { + Ok(ps) => ps, + Err(_) => return 0, + }; + + let run = match project_state.get_run_mut(run_name) { + Some(r) => r, + None => return 0, + }; + + let mut changes = 0; + + // Collect nodes to check — both Healthy and Unhealthy nodes get probed. + // Unhealthy nodes can recover if probes start passing again. 
+ let nodes_to_check: Vec<(String, String, String)> = run + .nodes + .iter() + .filter(|(_, ns)| ns.status == NodeStatus::Healthy || ns.status == NodeStatus::Unhealthy) + .map(|(key, ns)| (key.clone(), ns.node_name.clone(), ns.variant.clone())) + .collect(); + + for (key, node_name, variant_name) in &nodes_to_check { + let node_cfg = match config.nodes.get(node_name) { + Some(c) => c, + None => continue, + }; + let variant_cfg = match node_cfg.variants.get(variant_name) { + Some(c) => c, + None => continue, + }; + + let liveness = match variant_cfg.liveness_probe() { + Some(lp) => lp, + None => continue, + }; + + // Respect per-probe interval_ms — skip if not enough time has elapsed. + let check_key = format!("{}:{}:{}", project_root.to_string_lossy(), run_name, key); + let probe_interval = Duration::from_millis(liveness.interval_ms); + if let Some(last) = last_checks.get(&check_key) { + if last.elapsed() < probe_interval { + continue; + } + } + last_checks.insert(check_key, Instant::now()); + + // Run a single liveness check attempt. + let working_dir = config::resolve_cwd( + project_root, + node_cfg.cwd.as_deref(), + variant_cfg.cwd.as_deref(), + ); + + let node_label = format!("{node_name}:{variant_name}"); + + if let Some(log) = internal_log { + let _ = log + .write_line(&format!( + "[liveness] {node_label} — running probe (type: {})", + liveness.check_type + )) + .await; + } + + let check_result = + run_single_liveness_check(liveness, &working_dir, run, key, user_path).await; + + let node_state = match run.nodes.get_mut(key) { + Some(ns) => ns, + None => continue, + }; + + match check_result { + Ok(()) => { + if let Some(log) = internal_log { + let _ = log + .write_line(&format!("[liveness] {node_label} — probe passed")) + .await; + } + // Reset failure counter on success. 
+ if node_state.consecutive_failures > 0 || node_state.status == NodeStatus::Unhealthy + { + node_state.consecutive_failures = 0; + node_state.last_liveness_error = None; + // Transition Unhealthy -> Healthy (probe started passing again). + if node_state.status == NodeStatus::Unhealthy { + node_state.status = NodeStatus::Healthy; + info!( + node = node_name.as_str(), + variant = variant_name.as_str(), + "node self-healed — transitioning from unhealthy to healthy" + ); + if let Some(log) = internal_log { + let _ = log + .write_line(&format!( + "[liveness] {node_label} — self-healed, back to healthy" + )) + .await; + } + } + changes += 1; + } + } + Err(error_detail) => { + node_state.consecutive_failures += 1; + node_state.last_liveness_error = Some(error_detail.clone()); + changes += 1; + + info!( + node = node_name.as_str(), + variant = variant_name.as_str(), + consecutive_failures = node_state.consecutive_failures, + threshold = liveness.failure_threshold, + "liveness probe failed" + ); + + if let Some(log) = internal_log { + let _ = log + .write_line(&format!( + "[liveness] {node_label} — probe failed ({}/{} consecutive): {error_detail}", + node_state.consecutive_failures, liveness.failure_threshold + )) + .await; + } + + // Check if failure threshold is reached. + if node_state.consecutive_failures >= liveness.failure_threshold { + if node_state.recovery_count >= liveness.max_recoveries { + // Exhausted — permanently fail. 
+ node_state.status = NodeStatus::Failed; + warn!( + node = node_name.as_str(), + variant = variant_name.as_str(), + max_recoveries = liveness.max_recoveries, + "recovery exhausted — node permanently failed" + ); + + if let Some(log) = internal_log { + let _ = log + .write_line(&format!( + "[recovery] {node_label} — permanently failed after {} recovery attempts", + liveness.max_recoveries + )) + .await; + } + + let event = serde_json::json!({ + "event": "recovery_exhausted", + "run": run_name, + "project": project_root.to_string_lossy(), + "node": node_name, + "variant": variant_name, + "max_recoveries": liveness.max_recoveries, + "timestamp": chrono::Utc::now().to_rfc3339(), + }); + broadcaster.broadcast(&event).await; + } else { + // Trigger restart. + let new_recovery_count = node_state.recovery_count + 1; + + info!( + node = node_name.as_str(), + variant = variant_name.as_str(), + attempt = new_recovery_count, + max = liveness.max_recoveries, + "triggering recovery restart" + ); + + if let Some(log) = internal_log { + let _ = log + .write_line(&format!( + "[recovery] {node_label} — restarting environment (attempt {new_recovery_count}/{})", + liveness.max_recoveries + )) + .await; + } + + let event = serde_json::json!({ + "event": "recovery_starting", + "run": run_name, + "project": project_root.to_string_lossy(), + "node": node_name, + "variant": variant_name, + "attempt": new_recovery_count, + "max_recoveries": liveness.max_recoveries, + "timestamp": chrono::Utc::now().to_rfc3339(), + }); + broadcaster.broadcast(&event).await; + + // Save state BEFORE restart so recovery_count is persisted. + // Don't set status to Unhealthy — the restart will create + // fresh Healthy state. We only need recovery_count to survive. + node_state.recovery_count = new_recovery_count; + node_state.consecutive_failures = 0; + let _ = project_state.save(project_root); + + // Run the restart. 
This stops+starts the entire environment, + // creating fresh node state with recovery_count: 0. + run_veld_restart(project_root, run_name, internal_log, user_path).await; + + // Restore recovery_count on the fresh state so it accumulates + // across restarts and eventually hits max_recoveries. + if let Ok(mut fresh_state) = ProjectState::load(project_root) { + if let Some(fresh_run) = fresh_state.get_run_mut(run_name) { + if let Some(fresh_node) = fresh_run.nodes.get_mut(key) { + fresh_node.recovery_count = new_recovery_count; + } + } + let _ = fresh_state.save(project_root); + } + + // Return early — don't save stale in-memory state over + // the fresh state created by the restart. + return changes; + } + } + } + } + } + + // Persist any state changes (failure counts, etc.). + if changes > 0 { + let _ = project_state.save(project_root); + } + + changes +} + +/// Run a single liveness check for a node. +/// Returns `Ok(())` if healthy, `Err(reason)` with details if unhealthy. +async fn run_single_liveness_check( + liveness: &LivenessProbe, + working_dir: &Path, + run: &veld_core::state::RunState, + node_key: &str, + user_path: &str, +) -> Result<(), String> { + let node_state = match run.nodes.get(node_key) { + Some(ns) => ns, + None => return Ok(()), + }; + + match liveness.check_type.as_str() { + "command" | "bash" => { + if let Some(ref cmd) = liveness.command { + // Timeout command checks to prevent hanging the monitor loop. + // Inject the resolved user PATH so probes find tools like + // pg_isready even when the daemon starts at boot. + let result = tokio::time::timeout(Duration::from_secs(30), async { + let mut command = tokio::process::Command::new("sh"); + command + .arg("-c") + .arg(cmd) + .current_dir(working_dir) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::piped()) + .env("PATH", user_path); + // Inject node outputs as environment variables so probe + // commands can reference them (e.g., pg_isready -h $DB_HOST). 
+ for (key, value) in &node_state.outputs { + command.env(key, value); + } + command.output().await + }) + .await; + + match result { + Ok(Ok(output)) if output.status.success() => Ok(()), + Ok(Ok(output)) => { + let stderr = String::from_utf8_lossy(&output.stderr); + let stderr = stderr.trim(); + let code = output.status.code().unwrap_or(-1); + if stderr.is_empty() { + Err(format!("exit code {code}")) + } else { + Err(format!("exit code {code}: {stderr}")) + } + } + Ok(Err(e)) => Err(format!("exec error: {e}")), + Err(_) => Err("command timed out (30s)".to_owned()), + } + } else { + Ok(()) // No command configured, consider healthy. + } + } + "port" => { + if let Some(port) = node_state.port { + let addr: std::net::SocketAddr = ([127, 0, 0, 1], port).into(); + match tokio::time::timeout( + Duration::from_secs(5), + tokio::net::TcpStream::connect(addr), + ) + .await + { + Ok(Ok(_)) => Ok(()), + Ok(Err(e)) => Err(format!("port {port} connection failed: {e}")), + Err(_) => Err(format!("port {port} connection timed out")), + } + } else { + Ok(()) // No port known, skip. + } + } + "http" => { + if let Some(port) = node_state.port { + let path = liveness.path.as_deref().unwrap_or("/"); + let path = if path.starts_with('/') { + path.to_owned() + } else { + format!("/{path}") + }; + let url = format!("http://127.0.0.1:{port}{path}"); + let expected = liveness.expect_status.unwrap_or(200); + + let client = match reqwest::Client::builder() + .timeout(Duration::from_secs(5)) + .build() + { + Ok(c) => c, + Err(e) => return Err(format!("http client error: {e}")), + }; + + match client.get(&url).send().await { + Ok(resp) => { + let status = resp.status().as_u16(); + if status == expected { + Ok(()) + } else { + Err(format!("http status {status} (expected {expected})")) + } + } + Err(e) => Err(format!("http request failed: {e}")), + } + } else { + Ok(()) // No port known, skip. 
+ } + } + other => { + warn!( + check_type = other, + "unknown liveness probe type — treating as healthy" + ); + Ok(()) + } + } +} + +/// Load the VeldConfig for a project root, if a veld.json exists. +fn load_config_for_project(project_root: &Path) -> Option { + let config_path = project_root.join("veld.json"); + if !config_path.exists() { + return None; + } + config::load_config(&config_path).ok() +} + +/// Find the veld CLI binary path. +/// Checks: next to daemon binary, `~/.local/bin/veld`, then falls back to PATH. +fn find_veld_binary() -> std::path::PathBuf { + // 1. Same directory as daemon binary. + if let Some(sibling) = std::env::current_exe() + .ok() + .and_then(|p| p.parent().map(|d| d.join("veld"))) + .filter(|p| p.exists()) + { + return sibling; + } + + // 2. Standard user install location. + if let Some(home) = dirs::home_dir() { + let user_bin = home.join(".local/bin/veld"); + if user_bin.exists() { + return user_bin; + } + } + + // 3. System paths. + for path in ["/usr/local/bin/veld", "/usr/bin/veld"] { + let p = std::path::PathBuf::from(path); + if p.exists() { + return p; + } + } + + // 4. Fall back to PATH lookup. + std::path::PathBuf::from("veld") +} + +/// Run `veld restart --name ` and wait for completion. +/// Captures stdout/stderr and logs the result. 
+async fn run_veld_restart( + project_root: &Path, + run_name: &str, + internal_log: Option<&LogWriter>, + user_path: &str, +) { + let veld_bin = find_veld_binary(); + + info!( + run = run_name, + bin = %veld_bin.display(), + "running veld restart" + ); + + if let Some(log) = internal_log { + let _ = log + .write_line(&format!( + "[recovery] running: {} restart --name {}", + veld_bin.display(), + run_name + )) + .await; + } + + let result = tokio::time::timeout( + Duration::from_secs(300), // 5 min timeout for full restart + tokio::process::Command::new(&veld_bin) + .arg("restart") + .arg("--name") + .arg(run_name) + .current_dir(project_root) + .env("PATH", user_path) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .output(), + ) + .await; + + match result { + Ok(Ok(output)) => { + let code = output.status.code().unwrap_or(-1); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + + if output.status.success() { + info!(run = run_name, "veld restart completed successfully"); + if let Some(log) = internal_log { + let _ = log + .write_line(&format!( + "[recovery] veld restart completed (exit code {code})" + )) + .await; + if !stdout.trim().is_empty() { + for line in stdout.trim().lines() { + let _ = log.write_line(&format!("[recovery] {line}")).await; + } + } + } + } else { + warn!(run = run_name, exit_code = code, "veld restart failed"); + if let Some(log) = internal_log { + let _ = log + .write_line(&format!( + "[recovery] veld restart FAILED (exit code {code})" + )) + .await; + if !stdout.trim().is_empty() { + for line in stdout.trim().lines() { + let _ = log + .write_line(&format!("[recovery] stdout: {line}")) + .await; + } + } + if !stderr.trim().is_empty() { + for line in stderr.trim().lines() { + let _ = log + .write_line(&format!("[recovery] stderr: {line}")) + .await; + } + } + } + } + } + Ok(Err(e)) => { + warn!( + run = run_name, + bin = %veld_bin.display(), + 
error = %e, + "failed to execute veld restart" + ); + if let Some(log) = internal_log { + let _ = log + .write_line(&format!("[recovery] failed to execute veld restart: {e}")) + .await; + } + } + Err(_) => { + warn!(run = run_name, "veld restart timed out (300s)"); + if let Some(log) = internal_log { + let _ = log + .write_line("[recovery] veld restart timed out (300s)") + .await; + } + } + } +} + /// Check whether a given PID is alive by sending signal 0. fn is_process_alive(pid: u32) -> bool { - // On Unix, sending signal 0 checks for process existence. + let Some(pid) = i32::try_from(pid).ok().filter(|&p| p > 0) else { + return false; + }; unsafe { libc::kill(pid as libc::pid_t, 0) == 0 } } diff --git a/crates/veld/src/commands/init.rs b/crates/veld/src/commands/init.rs index a4cb865..d1e5707 100644 --- a/crates/veld/src/commands/init.rs +++ b/crates/veld/src/commands/init.rs @@ -9,8 +9,8 @@ use crate::output; // --------------------------------------------------------------------------- const INIT_TEMPLATE: &str = r#"{ - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "my-project", "url_template": "{service}.{run}.{project}.localhost", "presets": { @@ -510,7 +510,7 @@ fn generate_veld_json( ) -> String { let mut json = String::new(); json.push_str("{\n"); - json.push_str(" \"$schema\": \"https://veld.oss.life.li/schema/v1/veld.schema.json\",\n"); + json.push_str(" \"$schema\": \"https://veld.oss.life.li/schema/v2/veld.schema.json\",\n"); - json.push_str(" \"schemaVersion\": \"1\",\n"); + json.push_str(" \"schemaVersion\": \"2\",\n"); json.push_str(&format!(" \"name\": \"{}\",\n", escape_json(project_name))); json.push_str(&format!( @@ -563,7 +563,7 @@ fn generate_veld_json( " \"command\": \"{}\",\n", escape_json(command) )); - node.push_str(" \"health_check\": { \"type\": \"port\" }"); + node.push_str(" \"probes\": { \"readiness\": { \"type\": \"port\" } }"); // Add 
depends_on if any let service_deps: Vec<&String> = deps @@ -853,8 +853,8 @@ pub async fn run() -> i32 { // No services detected/selected: write basic template with project name format!( r#"{{ - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "{}", "url_template": "{}", "presets": {{ diff --git a/crates/veld/src/commands/logs.rs b/crates/veld/src/commands/logs.rs index e27e6ab..643f95f 100644 --- a/crates/veld/src/commands/logs.rs +++ b/crates/veld/src/commands/logs.rs @@ -15,6 +15,7 @@ pub enum SourceFilter { All, Server, Client, + Internal, } impl SourceFilter { @@ -23,6 +24,7 @@ impl SourceFilter { "all" => Some(Self::All), "server" => Some(Self::Server), "client" => Some(Self::Client), + "internal" | "veld" => Some(Self::Internal), _ => None, } } @@ -106,22 +108,31 @@ pub async fn run(opts: LogsOptions) -> i32 { // Build list of (path, node, variant, source_label) for each log file to read. let mut log_sources: Vec<(PathBuf, &str, &str, &str)> = Vec::new(); - for (node_name, variant) in &targets { - if source != SourceFilter::Client { - log_sources.push(( - logging::log_file(&project_root, run_name, node_name, variant), - node_name, - variant, - "server", - )); - } - if source != SourceFilter::Server { - log_sources.push(( - logging::client_log_file(&project_root, run_name, node_name, variant), - node_name, - variant, - "client", - )); + + // Internal (veld daemon) log — not per-node, shown when source is "all" or "internal". 
+ if source == SourceFilter::All || source == SourceFilter::Internal { + let internal_path = logging::internal_log_file(&project_root, run_name); + log_sources.push((internal_path, "_veld", "internal", "internal")); + } + + if source != SourceFilter::Internal { + for (node_name, variant) in &targets { + if source != SourceFilter::Client { + log_sources.push(( + logging::log_file(&project_root, run_name, node_name, variant), + node_name, + variant, + "server", + )); + } + if source != SourceFilter::Server { + log_sources.push(( + logging::client_log_file(&project_root, run_name, node_name, variant), + node_name, + variant, + "client", + )); + } } } @@ -299,22 +310,35 @@ async fn follow_logs( // Build list of (path, node, variant, source_label) to follow. let mut follow_sources: Vec<(PathBuf, String, String, String)> = Vec::new(); - for (node_name, variant) in targets { - if source != SourceFilter::Client { - follow_sources.push(( - logging::log_file(project_root, run_name, node_name, variant), - node_name.to_string(), - variant.to_string(), - "server".to_string(), - )); - } - if source != SourceFilter::Server { - follow_sources.push(( - logging::client_log_file(project_root, run_name, node_name, variant), - node_name.to_string(), - variant.to_string(), - "client".to_string(), - )); + + // Internal log. 
+ if source == SourceFilter::All || source == SourceFilter::Internal { + follow_sources.push(( + logging::internal_log_file(project_root, run_name), + "_veld".to_string(), + "internal".to_string(), + "internal".to_string(), + )); + } + + if source != SourceFilter::Internal { + for (node_name, variant) in targets { + if source != SourceFilter::Client { + follow_sources.push(( + logging::log_file(project_root, run_name, node_name, variant), + node_name.to_string(), + variant.to_string(), + "server".to_string(), + )); + } + if source != SourceFilter::Server { + follow_sources.push(( + logging::client_log_file(project_root, run_name, node_name, variant), + node_name.to_string(), + variant.to_string(), + "client".to_string(), + )); + } } } diff --git a/crates/veld/src/commands/start.rs b/crates/veld/src/commands/start.rs index 8728ae7..65c4142 100644 --- a/crates/veld/src/commands/start.rs +++ b/crates/veld/src/commands/start.rs @@ -420,7 +420,7 @@ fn render_progress_tty(event: &ProgressEvent, ctx: &mut TtyProgressCtx) { state.redraw(ctx.total, "starting..."); } } - ProgressEvent::HealthCheckPhase { + ProgressEvent::ReadinessProbePhase { node, variant, phase, @@ -433,7 +433,7 @@ fn render_progress_tty(event: &ProgressEvent, ctx: &mut TtyProgressCtx) { state.redraw(ctx.total, ""); } } - ProgressEvent::HealthCheckAttempt { + ProgressEvent::ReadinessProbeAttempt { node, variant, phase: _, @@ -444,7 +444,7 @@ fn render_progress_tty(event: &ProgressEvent, ctx: &mut TtyProgressCtx) { state.redraw(ctx.total, &format!("attempt {attempt}")); } } - ProgressEvent::HealthCheckPassed { + ProgressEvent::ReadinessProbePassed { node: _, variant: _, phase: _, @@ -479,7 +479,7 @@ fn render_progress_tty(event: &ProgressEvent, ctx: &mut TtyProgressCtx) { " {} {} {}", output::dim("~"), output::pad_right(&key, 30), - output::dim("skipped (verify passed)"), + output::dim("skipped (skip_if passed)"), ); if let Some(state) = ctx.bars.remove(&key) { state.bar.finish_with_message(finish_msg); 
diff --git a/crates/veld/src/commands/status.rs b/crates/veld/src/commands/status.rs index 3a4548f..0f9a1f5 100644 --- a/crates/veld/src/commands/status.rs +++ b/crates/veld/src/commands/status.rs @@ -105,6 +105,46 @@ pub async fn run(name: Option, show_outputs: bool, json: bool) -> i32 { output::print_table(&["NODE", "VARIANT", "STATUS", "URL"], &rows); + // Show liveness/recovery details for nodes that have them. + let has_liveness_info = run_state.nodes.values().any(|ns| { + ns.recovery_count > 0 + || ns.consecutive_failures > 0 + || ns.last_liveness_error.is_some() + || ns.status == NodeStatus::Unhealthy + }); + if has_liveness_info { + println!(); + println!("{}", output::bold("Liveness:")); + for key in &node_keys { + let ns = &run_state.nodes[*key]; + if ns.recovery_count == 0 + && ns.consecutive_failures == 0 + && ns.last_liveness_error.is_none() + && ns.status != NodeStatus::Unhealthy + { + continue; + } + println!(); + println!( + " {}", + output::cyan(&format!("{}:{}", ns.node_name, ns.variant)) + ); + if ns.consecutive_failures > 0 { + println!( + " {} consecutive failures: {}", + output::yellow("!"), + ns.consecutive_failures + ); + } + if ns.recovery_count > 0 { + println!(" {} recoveries: {}", output::dim("↻"), ns.recovery_count); + } + if let Some(ref err) = ns.last_liveness_error { + println!(" {} last error: {}", output::dim("→"), err); + } + } + } + // Show outputs per node when --outputs is passed. 
if show_outputs { println!(); @@ -174,6 +214,7 @@ fn format_run_status(status: &RunStatus) -> String { RunStatus::Stopping => output::yellow("stopping"), RunStatus::Stopped => output::dim("stopped"), RunStatus::Failed => output::red("failed"), + RunStatus::Recovering => output::yellow("recovering"), } } @@ -192,5 +233,6 @@ fn format_node_status(status: &NodeStatus) -> String { NodeStatus::Stopped => format!("{} {}", output::dim("-"), output::dim("stopped")), NodeStatus::Failed => format!("{} {}", output::cross(), output::red("failed")), NodeStatus::Skipped => format!("{} {}", output::dim("-"), output::dim("skipped")), + NodeStatus::Unhealthy => format!("{} {}", output::cross(), output::yellow("unhealthy")), } } diff --git a/crates/veld/src/commands/update.rs b/crates/veld/src/commands/update.rs index 6fb0b4c..1ecebf5 100644 --- a/crates/veld/src/commands/update.rs +++ b/crates/veld/src/commands/update.rs @@ -1,3 +1,9 @@ +use std::io::Write; + +use veld_core::config; +use veld_core::orchestrator::Orchestrator; +use veld_core::state::{GlobalRegistry, ProjectState, RunStatus}; + use crate::output; /// `veld update` -- update Veld to the latest version. @@ -8,6 +14,42 @@ pub async fn run() -> i32 { match veld_core::setup::check_update().await { Ok(Some(new_version)) => { + // Check for running environments and stop them before updating. + let running = find_running_environments(); + if !running.is_empty() { + println!(); + output::print_info(&format!( + "Found {} running environment(s) that must be stopped before updating:", + running.len() + )); + for (project, run_name) in &running { + println!( + " {} {}", + output::cyan(run_name), + output::dim(&format!("({})", project.display())) + ); + } + println!(); + print!( + "{}", + output::yellow("Stop all environments and proceed with update? 
[y/N] ") + ); + let _ = std::io::stdout().flush(); + + let mut answer = String::new(); + if std::io::stdin().read_line(&mut answer).is_err() + || !answer.trim().eq_ignore_ascii_case("y") + { + output::print_info("Update cancelled."); + return 0; + } + + // Stop all running environments. + let stopped = stop_all_environments(&running).await; + output::print_success(&format!("Stopped {stopped} environment(s).")); + println!(); + } + output::print_info(&format!("New version available: {current} → {new_version}")); output::print_info("Installing update..."); @@ -37,6 +79,65 @@ pub async fn run() -> i32 { } } +/// Find all running environments across all projects. +/// Returns (project_root, run_name) pairs. +fn find_running_environments() -> Vec<(std::path::PathBuf, String)> { + let registry = match GlobalRegistry::load() { + Ok(r) => r, + Err(_) => return Vec::new(), + }; + + let mut running = Vec::new(); + for entry in registry.projects.values() { + for (run_name, run_info) in &entry.runs { + if run_info.status == RunStatus::Running { + running.push((entry.project_root.clone(), run_name.clone())); + } + } + } + running +} + +/// Stop all running environments. Returns number successfully stopped. +async fn stop_all_environments(envs: &[(std::path::PathBuf, String)]) -> usize { + let mut stopped = 0; + for (project_root, run_name) in envs { + let config_path = project_root.join("veld.json"); + let cfg = match config::load_config(&config_path) { + Ok(c) => c, + Err(e) => { + output::print_error( + &format!("Failed to load config for {}: {e}", project_root.display()), + false, + ); + // Even if config can't load, try to clean up state. 
+ cleanup_state(project_root, run_name); + continue; + } + }; + + let mut orchestrator = Orchestrator::new(config_path, cfg); + match orchestrator.stop(run_name).await { + Ok(_) => { + output::print_info(&format!(" Stopped '{run_name}'")); + stopped += 1; + } + Err(e) => { + output::print_error(&format!(" Failed to stop '{run_name}': {e}"), false); + } + } + } + stopped +} + +/// Best-effort cleanup of state for a run when config can't be loaded. +fn cleanup_state(project_root: &std::path::Path, run_name: &str) { + if let Ok(mut state) = ProjectState::load(project_root) { + state.runs.remove(run_name); + let _ = state.save(project_root); + } +} + /// Re-install the Hammerspoon Spoon if it was previously set up. /// The Spoon files are embedded in the binary, so they need to be re-extracted /// after every CLI update to pick up any changes. diff --git a/crates/veld/src/main.rs b/crates/veld/src/main.rs index 3058786..07ff954 100644 --- a/crates/veld/src/main.rs +++ b/crates/veld/src/main.rs @@ -156,7 +156,7 @@ enum Command { #[arg(long)] json: bool, - /// Filter by log source: all, server, or client. + /// Filter by log source: all, server, client, or internal (veld daemon liveness/recovery logs). #[arg(long, default_value = "all")] source: String, @@ -351,7 +351,9 @@ async fn main() { let source_filter = commands::logs::SourceFilter::from_str(&source).unwrap_or_else(|| { output::print_error( - &format!("Invalid --source value '{source}'. Use: all, server, client"), + &format!( + "Invalid --source value '{source}'. 
Use: all, server, client, internal" + ), json, ); std::process::exit(1); diff --git a/docs/configuration.md b/docs/configuration.md index 05f5316..5b3b989 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -12,8 +12,8 @@ All relative paths in the configuration resolve relative to the directory contai ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "my-app", "nodes": { "backend": { @@ -34,7 +34,7 @@ All relative paths in the configuration resolve relative to the directory contai | Field | Type | Required | Description | |------------------|--------|----------|--------------------------------------------------| | `$schema` | string | No | JSON Schema URL for editor autocompletion | -| `schemaVersion` | string | Yes | Must be `"1"` for the current version | +| `schemaVersion` | string | Yes | `"1"` or `"2"`. Use `"2"` for new projects. | | `name` | string | Yes | Human-readable project name | | `url_template` | string | No | URL template for services (see [URL Templates]) | | `presets` | object | No | Named shortcuts for node:variant selections | @@ -67,10 +67,10 @@ The name is available as the `{project}` variable in URL templates and as `${vel ### `schemaVersion` -Must be `"1"`. Veld validates this on every command and exits with a clear error if it encounters an unknown version. +Must be `"1"` or `"2"`. Use `"2"` for new projects — it enables `probes` and `skip_if`. Version `"1"` is still fully supported (uses legacy `health_check` and `verify` fields). ```json -"schemaVersion": "1" +"schemaVersion": "2" ``` ### `url_template` @@ -322,14 +322,15 @@ A variant defines how a node behaves in a given context. 
The same node might be | `type` | string | Yes | All | `"command"` or `"start_server"` | | `command` | string | Varies | All | Inline shell command to execute | | `script` | string | Varies | `command` only | Path to script file, relative to `veld.json` | -| `health_check` | object | Required for `start_server` | `start_server` | How to verify the service is healthy | +| `health_check` | object | No | `start_server` | Legacy readiness probe. Deprecated: use `probes.readiness` | +| `probes` | object | No | All | Readiness and liveness probe configuration | | `depends_on` | object | No | All | Dependencies on other nodes | | `env` | object | No | All | Extra environment variables | | `outputs` | array or object | No | All | Output declarations (format varies by type) | | `sensitive_outputs` | array of strings | No | All | Output keys to mask and encrypt | | `url_template` | string | No | `start_server` | URL template override for this variant | | `on_stop` | string | No | All | Teardown command run when the environment is stopped | -| `verify` | string | No | `command` only | Idempotency verification command | +| `skip_if` | string | No | `command` only | Idempotency check — skip if exits 0 (alias: `verify`)| | `client_log_levels` | array of strings | No | `start_server` | Browser log levels override for this variant | | `features` | object | No | `start_server` | Feature toggles override for this variant | @@ -343,7 +344,7 @@ Runs a shell command or script to completion. Used for setup tasks such as datab - Must specify either `command` or `script` (mutually exclusive) - Can declare outputs by writing `key=value` lines to `$VELD_OUTPUT_FILE` (preferred) or via `VELD_OUTPUT key=value` on stdout (legacy, discouraged — exposes values in terminal/logs) - Built-in output: `exit_code` -- Supports the `verify` field for idempotency +- Supports the `skip_if` field for idempotency ```json { @@ -363,7 +364,7 @@ Starts and manages a long-lived process. 
Veld allocates a port, injects it as `$ - Built-in outputs: `url` (the full HTTPS URL) and `port` (the allocated port number) - Built-in variables: `${veld.port}` and `${veld.url}` are available in this node's `command`, `env`, and `outputs` templates - Ports and URLs are **pre-computed** before any node executes, so `${nodes.X.url}` and `${nodes.X.port}` for any `start_server` node are available everywhere -- no dependency edge required -- Supports the `health_check` field (required) +- Requires a readiness probe: use `probes.readiness` (preferred) or the legacy `health_check` field - Users never see or deal with port numbers -- only clean HTTPS URLs ```json @@ -451,6 +452,51 @@ Runs a shell command and checks the exit code. Exit code `0` means healthy. } ``` +### `probes` + +Configures readiness and liveness probes for a variant. Available for both `command` and `start_server` types. `probes.readiness` supersedes the legacy `health_check` field. + +```json +"probes": { + "readiness": { + "type": "http", + "path": "/health", + "timeout_seconds": 30 + }, + "liveness": { + "type": "command", + "command": "pg_isready -h localhost -p 5432", + "interval_ms": 5000, + "failure_threshold": 3, + "max_recoveries": 3 + } +} +``` + +#### Readiness Probe + +Gates the dependency graph during startup. Same fields as `health_check`. For `start_server` nodes, runs after the process starts. For `command` nodes, runs after the command exits 0. + +#### Liveness Probe + +Runs continuously after the node becomes healthy. Detects failures like dropped SSH tunnels, crashed background processes, or unreachable databases. Supports the same three check types as readiness probes: + +- **`http`**: Polls an HTTP endpoint. Passes when the expected status code is returned. +- **`port`**: Checks if a TCP port is accepting connections. +- **`command`**: Runs an arbitrary shell command (via `sh -c`). Exit code `0` means healthy, non-zero means unhealthy. Pipes, redirects, and `&&` chains all work. 
The node's outputs are injected as environment variables, so you can reference them directly (e.g., `pg_isready -h $DB_HOST -p $DB_PORT`). + +| Field | Type | Required | Description | +|---------------------|---------|----------|--------------------------------------------------------------| +| `type` | string | Yes | Strategy: `"http"`, `"port"`, or `"command"` | +| `path` | string | No | HTTP path to poll (`http` type only) | +| `expect_status` | integer | No | Expected HTTP status code (`http` type only, default: 200) | +| `command` | string | No | Shell command to run (`command` type only) | +| `interval_ms` | integer | No | Milliseconds between checks (default: 5000, min: 1000) | +| `failure_threshold` | integer | No | Consecutive failures before triggering recovery (default: 3) | +| `max_recoveries` | integer | No | Max recovery attempts before permanent failure (default: 3) | + +When `failure_threshold` consecutive liveness checks fail, Veld automatically restarts the entire environment (equivalent to `veld restart`). If the restart succeeds and the probe starts passing, the node returns to healthy. If `max_recoveries` restart attempts are exhausted, the node is marked as permanently failed and no further restarts are attempted. You can see recovery status via `veld status` and `veld logs --source internal`. + ### `depends_on` Declares dependencies as explicit `node:variant` pairs. Dependencies are resolved before this variant starts. The value is an object mapping node names to variant names. @@ -554,21 +600,21 @@ An array of output key names whose values are sensitive. These outputs are: } ``` -### `verify` +### `skip_if` -An idempotency verification command. Only applies to `command` type variants. Before running the main command/script, Veld executes the verify command: +An idempotency check command (previously named `verify`, which is still accepted as an alias). Only applies to `command` type variants. 
Before running the main command/script, Veld executes the `skip_if` command: - **Exit code 0:** The step is considered already complete and is skipped. - **Non-zero exit code:** The step runs normally. -- If `verify` itself errors unexpectedly, the step re-runs (safe default). +- If `skip_if` itself errors unexpectedly, the step re-runs (safe default). -The verify command receives the previous run's output variables as environment variables, so it can check whether the previous result is still valid. +The `skip_if` command receives the previous run's output variables as environment variables, so it can check whether the previous result is still valid. ```json { "type": "command", "script": "./scripts/clone-db.sh", - "verify": "./scripts/verify-db.sh", + "skip_if": "./scripts/verify-db.sh", "outputs": ["DATABASE_URL"] } ``` @@ -851,8 +897,8 @@ Below is a realistic `veld.json` for a monorepo with a database, backend API, fr ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "my-project", "url_template": "{service}.{branch ?? run}.my-project.localhost", @@ -868,7 +914,7 @@ Below is a realistic `veld.json` for a monorepo with a database, backend API, fr "local": { "type": "command", "script": "./scripts/clone-db.sh", - "verify": "./scripts/verify-db.sh", + "skip_if": "./scripts/verify-db.sh", "on_stop": "./scripts/drop-db.sh", "outputs": ["DATABASE_URL"], "sensitive_outputs": ["DATABASE_URL"] @@ -894,7 +940,7 @@ Below is a realistic `veld.json` for a monorepo with a database, backend API, fr "default": { "type": "command", "command": "./scripts/generate-dev-certs.sh", - "verify": "test -f ./certs/dev.pem" + "skip_if": "test -f ./certs/dev.pem" } } }, @@ -1055,7 +1101,7 @@ Veld provides a JSON Schema for editor autocompletion and validation. 
Add the `$ ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", ... } ``` diff --git a/docs/scenarios.md b/docs/scenarios.md index b7cc787..72bef02 100644 --- a/docs/scenarios.md +++ b/docs/scenarios.md @@ -35,8 +35,8 @@ For the full field reference, see [configuration.md](./configuration.md). ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "shopfront", "url_template": "{service}.{branch ?? run}.shopfront.localhost", @@ -47,7 +47,7 @@ For the full field reference, see [configuration.md](./configuration.md). "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=shopfront -p ${veld.port}:5432 postgres:16", "on_stop": "docker stop veld-pg-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 30 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/shopfront" } @@ -60,7 +60,7 @@ For the full field reference, see [configuration.md](./configuration.md). "local": { "type": "start_server", "command": "pnpm --filter @shopfront/api dev --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } } }, "depends_on": { "database": "docker" }, "env": { "DATABASE_URL": "${nodes.database.DATABASE_URL}", @@ -75,7 +75,7 @@ For the full field reference, see [configuration.md](./configuration.md). 
"local": { "type": "start_server", "command": "pnpm --filter @shopfront/web dev", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } } }, "depends_on": { "backend": "local" }, "env": { "PORT": "${veld.port}", @@ -98,8 +98,8 @@ For the full field reference, see [configuration.md](./configuration.md). ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "portal", "url_template": "{service}.{branch ?? run}.portal.localhost", @@ -109,7 +109,7 @@ For the full field reference, see [configuration.md](./configuration.md). "local": { "type": "start_server", "command": "pnpm --filter @portal/api dev --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } } }, "env": { "CORS_ORIGIN": "${nodes.frontend.url}", "ALLOWED_ORIGINS": "${nodes.frontend.url},${nodes.admin.url}" @@ -123,7 +123,7 @@ For the full field reference, see [configuration.md](./configuration.md). "local": { "type": "start_server", "command": "pnpm --filter @portal/web dev", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } } }, "env": { "PORT": "${veld.port}", "NEXT_PUBLIC_API_URL": "${nodes.backend.url}" @@ -137,7 +137,7 @@ For the full field reference, see [configuration.md](./configuration.md). 
"local": { "type": "start_server", "command": "pnpm --filter @portal/admin dev", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } } }, "env": { "PORT": "${veld.port}", "NEXT_PUBLIC_API_URL": "${nodes.backend.url}" @@ -161,8 +161,8 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "taskboard", "url_template": "{service}.{branch ?? run}.taskboard.localhost", @@ -173,12 +173,12 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=taskboard -p ${veld.port}:5432 postgres:16", "on_stop": "docker stop veld-pg-${veld.run}", - "health_check": { + "probes": { "readiness": { "type": "command", "command": "docker exec veld-pg-${veld.run} pg_isready -U postgres", "timeout_seconds": 30, "interval_ms": 2000 - }, + } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/taskboard" } @@ -192,11 +192,11 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs "type": "start_server", "command": "docker run --rm --name veld-redis-${veld.run} -p ${veld.port}:6379 redis:7-alpine", "on_stop": "docker stop veld-redis-${veld.run}", - "health_check": { + "probes": { "readiness": { "type": "command", "command": "docker exec veld-redis-${veld.run} redis-cli ping", "timeout_seconds": 15 - }, + } }, "outputs": { "REDIS_URL": "redis://localhost:${veld.port}/0" } @@ -209,7 +209,7 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs "local": { "type": "start_server", "command": "cargo run --bin taskboard-api -- --port ${veld.port}", - "health_check": { "type": "http", 
"path": "/health", "timeout_seconds": 60 }, + "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } } }, "depends_on": { "postgres": "docker", "redis": "docker" @@ -235,8 +235,8 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "dashboard", "url_template": "{service}.{branch ?? run}.dashboard.localhost", @@ -252,7 +252,7 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=dashboard -p ${veld.port}:5432 postgres:16", "on_stop": "docker stop veld-pg-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 30 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/dashboard" } @@ -266,7 +266,7 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs "local": { "type": "start_server", "command": "pnpm --filter @dashboard/api dev --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } } }, "depends_on": { "database": "docker" }, "env": { "DATABASE_URL": "${nodes.database.DATABASE_URL}" @@ -286,7 +286,7 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs "local": { "type": "start_server", "command": "pnpm --filter @dashboard/web dev", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } } }, "depends_on": { "backend": "local" }, "env": { "PORT": "${veld.port}", @@ -296,7 +296,7 @@ Note: This pre-computation only applies to the built-in `url` and 
`port` outputs "staging": { "type": "start_server", "command": "pnpm --filter @dashboard/web dev", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } } }, "depends_on": { "backend": "staging" }, "env": { "PORT": "${veld.port}", @@ -324,8 +324,8 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "analytics", "url_template": "{service}.{branch ?? run}.analytics.localhost", @@ -336,11 +336,11 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -p ${veld.port}:5432 -v veld-pg-data-${veld.run}:/var/lib/postgresql/data postgres:16", "on_stop": "docker stop veld-pg-${veld.run}", - "health_check": { + "probes": { "readiness": { "type": "command", "command": "docker exec veld-pg-${veld.run} pg_isready -U postgres", "timeout_seconds": 30 - }, + } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/analytics" } @@ -354,7 +354,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin "default": { "type": "command", "command": "pg_dump $SOURCE_DB_URL | psql ${nodes.postgres.DATABASE_URL}", - "verify": "psql ${nodes.postgres.DATABASE_URL} -c 'SELECT 1 FROM users LIMIT 1'", + "skip_if": "psql ${nodes.postgres.DATABASE_URL} -c 'SELECT 1 FROM users LIMIT 1'", "depends_on": { "postgres": "docker" }, "env": { "SOURCE_DB_URL": "postgresql://readonly:secret@staging.analytics.example.com:5432/analytics" @@ -370,7 +370,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin "local": { "type": "start_server", "command": "pnpm --filter @analytics/api dev --port ${veld.port}", - 
"health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } } }, "depends_on": { "clone-db": "default" }, "env": { "DATABASE_URL": "${nodes.postgres.DATABASE_URL}" @@ -382,7 +382,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin } ``` -**What happens:** The `clone-db` node depends on `postgres:docker`. Before running the expensive `pg_dump | psql` pipeline, Veld executes the `verify` command. If the `users` table already has data (`SELECT 1` succeeds), the clone is skipped entirely. On the first run, the clone executes. On subsequent runs, it is a no-op. The `hidden: true` flag keeps `clone-db` out of `veld nodes` output since it is an internal concern. +**What happens:** The `clone-db` node depends on `postgres:docker`. Before running the expensive `pg_dump | psql` pipeline, Veld executes the `skip_if` command. If the `users` table already has data (`SELECT 1` succeeds), the clone is skipped entirely. On the first run, the clone executes. On subsequent runs, it is a no-op. The `hidden: true` flag keeps `clone-db` out of `veld nodes` output since it is an internal concern. --- @@ -392,8 +392,8 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "rideshare", "url_template": "{service}.{branch ?? 
run}.rideshare.localhost", @@ -410,7 +410,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -p ${veld.port}:5432 postgres:16", "on_stop": "docker stop veld-pg-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 30 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/rideshare" } @@ -424,7 +424,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin "type": "start_server", "command": "docker run --rm --name veld-redis-${veld.run} -p ${veld.port}:6379 redis:7-alpine", "on_stop": "docker stop veld-redis-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 15 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 15 } } }, "outputs": { "REDIS_URL": "redis://localhost:${veld.port}/0" } @@ -437,7 +437,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin "local": { "type": "start_server", "command": "cargo run --bin rider-service -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 }, + "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } } }, "depends_on": { "postgres": "docker", "redis": "docker" }, "env": { "DATABASE_URL": "${nodes.postgres.DATABASE_URL}", @@ -453,7 +453,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin "local": { "type": "start_server", "command": "cargo run --bin driver-service -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 }, + "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } } }, "depends_on": { "postgres": "docker", "redis": "docker" }, "env": { "DATABASE_URL": "${nodes.postgres.DATABASE_URL}", 
@@ -468,7 +468,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin "local": { "type": "start_server", "command": "cargo run --bin pricing-service -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 }, + "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } }, "depends_on": { "redis": "docker" }, "env": { "REDIS_URL": "${nodes.redis.REDIS_URL}" @@ -482,7 +482,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin "local": { "type": "start_server", "command": "cargo run --bin notification-service -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 }, + "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } }, "depends_on": { "redis": "docker" }, "env": { "REDIS_URL": "${nodes.redis.REDIS_URL}" @@ -496,7 +496,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin "local": { "type": "start_server", "command": "cargo run --bin gateway -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "rider-service": "local", "driver-service": "local", @@ -535,8 +535,8 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "enterprise-app", "url_template": "{service}.{branch ??
run}.enterprise-app.localhost", @@ -547,11 +547,11 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=enterprise -p ${veld.port}:5432 postgres:16", "on_stop": "docker stop veld-pg-${veld.run}", - "health_check": { + "probes": { "readiness": { "type": "command", "command": "docker exec veld-pg-${veld.run} pg_isready -U postgres", "timeout_seconds": 30 - }, + } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/enterprise" } @@ -565,7 +565,7 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se "default": { "type": "command", "command": "./scripts/generate-dev-certs.sh", - "verify": "test -f ./certs/dev.pem && test -f ./certs/dev-key.pem", + "skip_if": "test -f ./certs/dev.pem && test -f ./certs/dev-key.pem", "outputs": ["CERT_PATH", "KEY_PATH"] } } @@ -581,7 +581,7 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se "env": { "DATABASE_URL": "${nodes.postgres.DATABASE_URL}" }, - "verify": "pnpm --filter @enterprise/db migrate:status --exit-code" + "skip_if": "pnpm --filter @enterprise/db migrate:status --exit-code" } } }, @@ -596,7 +596,7 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se "env": { "DATABASE_URL": "${nodes.postgres.DATABASE_URL}" }, - "verify": "pnpm --filter @enterprise/db seed:check" + "skip_if": "pnpm --filter @enterprise/db seed:check" } } }, @@ -606,7 +606,7 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se "local": { "type": "start_server", "command": "pnpm --filter @enterprise/api dev --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "seed-db": "default", "generate-certs": "default" @@ -625,7 +625,7 @@ The presets let you
run subsets: `--preset riders-only` starts only the rider se "local": { "type": "start_server", "command": "pnpm --filter @enterprise/web dev", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } }, "depends_on": { "backend": "local" }, "env": { "PORT": "${veld.port}", @@ -641,12 +641,12 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se **What happens:** 1. `postgres:docker` and `generate-certs:default` start in parallel (independent). -2. Once Postgres is ready, `migrate-db` runs (skipped if migrations are current, thanks to `verify`). +2. Once Postgres is ready, `migrate-db` runs (skipped if migrations are current, thanks to `skip_if`). 3. Once migrations complete, `seed-db` runs (skipped if seed data exists). 4. Once both `seed-db` and `generate-certs` finish, `backend:local` starts. 5. Finally, `frontend:local` starts. -The `verify` commands on the setup nodes make subsequent `veld start` calls fast -- if certs exist, migrations are current, and seed data is present, all three setup steps are skipped in milliseconds. +The `skip_if` commands on the setup nodes make subsequent `veld start` calls fast -- if certs exist, migrations are current, and seed data is present, all three setup steps are skipped in milliseconds. --- @@ -656,8 +656,8 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "search-platform", "url_template": "{service}.{branch ??
run}.search-platform.localhost", @@ -668,7 +668,7 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=search_platform -p ${veld.port}:5432 postgres:16", "on_stop": "docker stop veld-pg-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 30 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/search_platform", "JDBC_URL": "jdbc:postgresql://localhost:${veld.port}/search_platform" @@ -683,7 +683,7 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast "type": "start_server", "command": "docker run --rm --name veld-redis-${veld.run} -p ${veld.port}:6379 redis:7-alpine --appendonly yes", "on_stop": "docker stop veld-redis-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 15 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 15 } }, "outputs": { "REDIS_URL": "redis://localhost:${veld.port}/0", "REDIS_CACHE_URL": "redis://localhost:${veld.port}/1", @@ -699,12 +699,12 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast "type": "start_server", "command": "docker run --rm --name veld-es-${veld.run} -e discovery.type=single-node -e xpack.security.enabled=false -e ES_JAVA_OPTS='-Xms512m -Xmx512m' -p ${veld.port}:9200 elasticsearch:8.13.0", "on_stop": "docker stop veld-es-${veld.run}", - "health_check": { + "probes": { "readiness": { "type": "http", "path": "/_cluster/health", "timeout_seconds": 90, "interval_ms": 3000 - }, + } }, "outputs": { "ELASTICSEARCH_URL": "http://localhost:${veld.port}" } @@ -717,12 +717,12 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast "local": { "type": "start_server", "command": "pnpm --filter @search-platform/api dev --port ${veld.port}", -
"health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "postgres": "docker", "redis": "docker", "elasticsearch": "docker" }, "env": { "DATABASE_URL": "${nodes.postgres.DATABASE_URL}", "REDIS_URL": "${nodes.redis.REDIS_URL}", @@ -747,8 +747,8 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "crm", "url_template": "{service}.{branch ?? run}.crm.localhost", @@ -759,7 +759,7 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.branch}-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=crm_${veld.branch} -p ${veld.port}:5432 -v veld-pg-${veld.branch}:/var/lib/postgresql/data postgres:16", "on_stop": "docker stop veld-pg-${veld.branch}-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 30 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/crm_${veld.branch}" } @@ -772,7 +772,7 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast "local": { "type": "start_server", "command": "pnpm --filter @crm/api dev --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "database": "docker" }, "env": { "DATABASE_URL": "${nodes.database.DATABASE_URL}" @@ -786,7 +786,7 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast "local": { "type": "start_server", "command": "pnpm --filter @crm/web dev", - "health_check": { "type": "http", "path": "/" }, + "probes": {
"readiness": { "type": "http", "path": "/" } }, "depends_on": { "backend": "local" }, "env": { "PORT": "${veld.port}", @@ -815,8 +815,8 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "inventory", "url_template": "{service}.{username}.{branch ?? run}.inventory.localhost", @@ -826,7 +826,7 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab "local": { "type": "start_server", "command": "go run ./cmd/server --port ${veld.port}", - "health_check": { "type": "http", "path": "/healthz" }, + "probes": { "readiness": { "type": "http", "path": "/healthz" } }, "env": { "FRONTEND_URL": "${nodes.frontend.url}" } @@ -839,7 +839,7 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab "local": { "type": "start_server", "command": "pnpm --filter @inventory/web dev", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } }, "env": { "PORT": "${veld.port}", "NEXT_PUBLIC_API_URL": "${nodes.backend.url}" @@ -861,8 +861,8 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "fintech", "url_template": "{service}.{branch ??
run}.fintech.localhost", @@ -873,7 +873,7 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab "default": { "type": "command", "script": "./scripts/provision-dev-db.sh", - "verify": "./scripts/check-dev-db.sh", + "skip_if": "./scripts/check-dev-db.sh", "on_stop": "./scripts/teardown-dev-db.sh", "outputs": ["DATABASE_URL", "DB_PASSWORD", "DB_READONLY_URL"], "sensitive_outputs": ["DATABASE_URL", "DB_PASSWORD", "DB_READONLY_URL"] @@ -898,7 +898,7 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab "local": { "type": "start_server", "command": "pnpm --filter @fintech/api dev --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "provision-db": "default", "fetch-api-keys": "default" @@ -931,8 +931,8 @@ The `api` node receives the secrets as environment variables at runtime. ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "saas-platform", "url_template": "{service}.{branch ?? run}.saas-platform.test", @@ -942,7 +942,7 @@ The `api` node receives the secrets as environment variables at runtime. "local": { "type": "start_server", "command": "go run ./cmd/api --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "env": { "COOKIE_DOMAIN": ".saas-platform.test", "CORS_ORIGINS": "${nodes.web.url},${nodes.admin.url}" @@ -956,7 +956,7 @@ The `api` node receives the secrets as environment variables at runtime.
"local": { "type": "start_server", "command": "pnpm --filter @saas/web dev", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } }, "env": { "PORT": "${veld.port}", "NEXT_PUBLIC_API_URL": "${nodes.api.url}" @@ -971,7 +971,7 @@ The `api` node receives the secrets as environment variables at runtime. "local": { "type": "start_server", "command": "pnpm --filter @saas/admin dev", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } }, "env": { "PORT": "${veld.port}", "NEXT_PUBLIC_API_URL": "${nodes.api.url}" @@ -997,8 +997,8 @@ The `admin` node uses a node-level `url_template` override to produce a differen ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "marketplace", "url_template": "{service}.{branch ?? run}.marketplace.localhost", @@ -1015,7 +1015,7 @@ The `admin` node uses a node-level `url_template` override to produce a differen "local": { "type": "start_server", "command": "cargo run --bin user-service -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 } + "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } } }, "staging": { "type": "command", @@ -1031,7 +1031,7 @@ The `admin` node uses a node-level `url_template` override to produce a differen "local": { "type": "start_server", "command": "cargo run --bin catalog-service -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 } + "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } } }, "staging": { "type": "command", @@ -1047,7 +1047,7 @@ The `admin` node uses a node-level `url_template` override to produce a differen "local": { "type": "start_server", "command": "cargo run --bin
payment-service -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 } + "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } } }, "staging": { "type": "command", @@ -1073,7 +1073,7 @@ The `admin` node uses a node-level `url_template` override to produce a differen "local": { "type": "start_server", "command": "cargo run --bin gateway -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "user-service": "staging", "catalog-service": "local", @@ -1090,7 +1090,7 @@ The `admin` node uses a node-level `url_template` override to produce a differen "local-full": { "type": "start_server", "command": "cargo run --bin gateway -- --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "user-service": "local", "catalog-service": "local", @@ -1122,8 +1122,8 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "docs-site", "url_template": "{service}.{branch ??
run}.docs-site.localhost", @@ -1134,7 +1134,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ "default": { "type": "command", "command": "pnpm --filter @docs-site/content build", - "verify": "test -d ./packages/content/dist && test ./packages/content/dist/index.html -nt ./packages/content/src/index.md" + "skip_if": "test -d ./packages/content/dist && test ./packages/content/dist/index.html -nt ./packages/content/src/index.md" } } }, @@ -1145,7 +1145,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ "default": { "type": "command", "command": "cargo build --release --bin docs-api", - "verify": "test -f ./target/release/docs-api && test ./target/release/docs-api -nt ./src/main.rs" + "skip_if": "test -f ./target/release/docs-api && test ./target/release/docs-api -nt ./src/main.rs" } } }, @@ -1155,7 +1155,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ "local": { "type": "start_server", "command": "./target/release/docs-api --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "build-api": "default" } } } @@ -1166,7 +1166,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ "local": { "type": "start_server", "command": "python3 -m http.server ${veld.port} --directory ./packages/content/dist", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } }, "depends_on": { "build-docs": "default" } } } @@ -1175,7 +1175,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ } ``` -**What happens:** The `build-docs` and `build-api` command nodes run first (in parallel, since they are independent). The `verify` commands check whether the build artifacts are newer than the source files -- if so, the builds are skipped.
Then `docs:local` serves the static files and `api:local` runs the compiled binary. On first run, both builds execute. On subsequent runs, they are skipped unless source files changed. +**What happens:** The `build-docs` and `build-api` command nodes run first (in parallel, since they are independent). The `skip_if` commands check whether the build artifacts are newer than the source files -- if so, the builds are skipped. Then `docs:local` serves the static files and `api:local` runs the compiled binary. On first run, both builds execute. On subsequent runs, they are skipped unless source files changed. --- @@ -1185,8 +1185,8 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "jobrunner", "url_template": "{service}.{branch ?? run}.jobrunner.localhost", @@ -1202,7 +1202,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=jobrunner -p ${veld.port}:5432 postgres:16", "on_stop": "docker stop veld-pg-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 30 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/jobrunner" } @@ -1216,7 +1216,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ "type": "start_server", "command": "docker run --rm --name veld-redis-${veld.run} -p ${veld.port}:6379 redis:7-alpine", "on_stop": "docker stop veld-redis-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 15 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 15 } }, "outputs": { "REDIS_URL":
"redis://localhost:${veld.port}/0" } @@ -1229,7 +1229,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ "local": { "type": "start_server", "command": "pnpm --filter @jobrunner/api dev --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "postgres": "docker", "redis": "docker" @@ -1248,7 +1248,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ "local": { "type": "start_server", "command": "pnpm --filter @jobrunner/worker start --port ${veld.port} --concurrency 5", - "health_check": { "type": "http", "path": "/status" }, + "probes": { "readiness": { "type": "http", "path": "/status" } }, "depends_on": { "postgres": "docker", "redis": "docker" @@ -1267,7 +1267,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$ "local": { "type": "start_server", "command": "pnpm --filter @jobrunner/scheduler start --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "redis": "docker" }, @@ -1300,8 +1300,8 @@ All three application nodes are `start_server` with health check endpoints, so V ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "polyglot", "url_template": "{service}.{branch ??
run}.polyglot.localhost", @@ -1312,7 +1312,7 @@ All three application nodes are `start_server` with health check endpoints, so V "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=polyglot -p ${veld.port}:5432 postgres:16", "on_stop": "docker stop veld-pg-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 30 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/polyglot", "JDBC_URL": "jdbc:postgresql://localhost:${veld.port}/polyglot" @@ -1326,7 +1326,7 @@ All three application nodes are `start_server` with health check endpoints, so V "local": { "type": "start_server", "command": "cd services/auth && go run . --port ${veld.port}", - "health_check": { "type": "http", "path": "/healthz" }, + "probes": { "readiness": { "type": "http", "path": "/healthz" } }, "depends_on": { "postgres": "docker" }, "env": { "DATABASE_URL": "${nodes.postgres.DATABASE_URL}" @@ -1340,12 +1340,12 @@ All three application nodes are `start_server` with health check endpoints, so V "local": { "type": "start_server", "command": "cd services/billing && ./gradlew bootRun --args='--server.port=${veld.port}'", - "health_check": { + "probes": { "readiness": { "type": "http", "path": "/actuator/health", "timeout_seconds": 90, "interval_ms": 3000 - }, + } }, "depends_on": { "postgres": "docker" }, "env": { "SPRING_DATASOURCE_URL": "${nodes.postgres.JDBC_URL}", @@ -1361,7 +1361,7 @@ All three application nodes are `start_server` with health check endpoints, so V "local": { "type": "start_server", "command": "cd services/recommendations && uvicorn main:app --host 0.0.0.0 --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "postgres": "docker" }, "env": { "DATABASE_URL":
"${nodes.postgres.DATABASE_URL}" @@ -1375,12 +1375,12 @@ All three application nodes are `start_server` with health check endpoints, so V "local": { "type": "start_server", "command": "pnpm --filter @polyglot/web dev", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } }, "depends_on": { "auth-service": "local", "billing-service": "local", "recommendation-service": "local" }, "env": { "PORT": "${veld.port}", "NEXT_PUBLIC_AUTH_URL": "${nodes.auth-service.url}", @@ -1404,8 +1404,8 @@ All three application nodes are `start_server` with health check endpoints, so V ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "e2e-suite", "url_template": "{service}.{branch ?? run}.e2e-suite.localhost", @@ -1421,7 +1421,7 @@ All three application nodes are `start_server` with health check endpoints, so V "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=e2e -p ${veld.port}:5432 postgres:16", "on_stop": "docker stop veld-pg-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 30 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/e2e" } @@ -1448,7 +1448,7 @@ All three application nodes are `start_server` with health check endpoints, so V "local": { "type": "start_server", "command": "pnpm --filter @e2e-suite/api dev --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "seed-test-data": "default" }, "env": { "DATABASE_URL": "${nodes.postgres.DATABASE_URL}", @@ -1463,7 +1463,7 @@ All three application nodes are `start_server` with health check endpoints, so V "local": {
"type": "start_server", "command": "pnpm --filter @e2e-suite/web dev", - "health_check": { "type": "http", "path": "/" }, + "probes": { "readiness": { "type": "http", "path": "/" } }, "depends_on": { "backend": "local" }, "env": { "PORT": "${veld.port}", @@ -1512,8 +1512,8 @@ With `--preset dev`, only the frontend and its dependencies start (no test runne ```json { - "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json", - "schemaVersion": "1", + "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json", + "schemaVersion": "2", "name": "platform", "url_template": "{service}.{branch ?? run}.platform.localhost", @@ -1535,7 +1535,7 @@ With `--preset dev`, only the frontend and its dependencies start (no test runne "type": "start_server", "command": "docker run --rm --name veld-pg-${veld.run} --network ${veld.project}-net -e POSTGRES_PASSWORD=veld -p ${veld.port}:5432 postgres:16", "on_stop": "docker stop veld-pg-${veld.run}", - "health_check": { "type": "port", "timeout_seconds": 30 }, + "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } }, "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/app" } @@ -1547,7 +1547,7 @@ With `--preset dev`, only the frontend and its dependencies start (no test runne "local": { "type": "start_server", "command": "pnpm --filter backend dev --port ${veld.port}", - "health_check": { "type": "http", "path": "/health" }, + "probes": { "readiness": { "type": "http", "path": "/health" } }, "depends_on": { "postgres": "docker" }, "env": { "DATABASE_URL": "${nodes.postgres.DATABASE_URL}" } } diff --git a/install.sh b/install.sh index c355398..e405ed6 100755 --- a/install.sh +++ b/install.sh @@ -210,6 +210,71 @@ fi # --- Install --- +# --- Stop running environments (prevents stale state files after upgrade) --- +# +# When upgrading across major versions, state file format may change.
+# Stop all running environments using the OLD binary so state is cleanly +# removed before the new binary is installed. + +EXISTING_VELD_BIN="$(command -v veld 2>/dev/null || true)" +if [ -n "$EXISTING_VELD_BIN" ]; then + # Use `veld list --json` to find running environments across ALL projects. + # Parsed below with python3 — no jq dependency required. + LIST_JSON="$("$EXISTING_VELD_BIN" list --json 2>/dev/null || true)" + + # Extract project_root + run_name pairs for running environments. + # The JSON structure is: { "projects": { "<project_root>": { "runs": { "<run_name>": { "status": "running" } } } } } + RUNNING_INFO="" + if [ -n "$LIST_JSON" ]; then + # Use python3 (available on macOS and most Linux) for reliable JSON parsing. + RUNNING_INFO="$(echo "$LIST_JSON" | python3 -c ' +import json, sys +try: + data = json.load(sys.stdin) + for path, proj in data.get("projects", {}).items(): + for name, run in proj.get("runs", {}).items(): + if run.get("status") == "running": + print(f"{path}\t{name}") +except: pass +' 2>/dev/null || true)" + fi + + if [ -n "$RUNNING_INFO" ]; then + echo "" + echo "============================================================" + echo " RUNNING ENVIRONMENTS DETECTED" + echo "============================================================" + echo "" + echo " The following environments will be stopped before updating" + echo " (prevents stale state files after upgrade):" + echo "" + echo "$RUNNING_INFO" | while IFS=$'\t' read -r proj_root run_name; do + echo " - ${run_name} (${proj_root})" + done + echo "" + + if [ -z "${VELD_NON_INTERACTIVE:-}" ] && [ -t 0 ]; then + printf " Stop all and continue? [Y/n] " + read -r answer < /dev/tty 2>/dev/null || answer="y" + answer="${answer:-y}" + if [ "$answer" != "y" ] && [ "$answer" != "Y" ]; then + echo "Update cancelled." + exit 0 + fi + fi + + echo " Stopping environments..."
+ echo "$RUNNING_INFO" | while IFS=$'\t' read -r proj_root run_name; do + if (cd "$proj_root" 2>/dev/null && "$EXISTING_VELD_BIN" stop --name "$run_name" 2>/dev/null); then + echo " Stopped '${run_name}'" + else + echo " Warning: could not stop '${run_name}' (may need manual cleanup)" + fi + done + echo "" + fi +fi + echo "Installing binaries..." $NEED_SUDO mkdir -p "$INSTALL_DIR" $NEED_SUDO mkdir -p "$LIB_DIR" diff --git a/schema/v1/veld.schema.json b/schema/v1/veld.schema.json index 08ee57e..c5caf28 100644 --- a/schema/v1/veld.schema.json +++ b/schema/v1/veld.schema.json @@ -46,28 +46,6 @@ "$ref": "#/$defs/ClientLogLevels", "description": "Client-side log levels to capture (project-level default). Overridable at node and variant level." }, - "features": { - "$ref": "#/$defs/FeaturesConfig", - "description": "Feature toggles (project-level defaults). Overridable at node and variant level." - }, - "env": { - "$ref": "#/$defs/EnvMap", - "description": "Global environment variables inherited by all node variants. Overridable at node and variant level." - }, - "setup": { - "type": "array", - "description": "Steps that run sequentially before the dependency graph executes. If any step exits non-zero, startup is aborted. Not nodes — no variants, no health checks, no dependency graph participation.", - "items": { - "$ref": "#/$defs/SetupStep" - } - }, - "teardown": { - "type": "array", - "description": "Steps that run sequentially after all nodes stop (after per-node on_stop hooks). Best-effort: failures are logged but do not block the stop operation. Commands should be idempotent.", - "items": { - "$ref": "#/$defs/SetupStep" - } - }, "nodes": { "type": "object", "description": "The dependency graph nodes. Each key is a node name, and the value defines its variants.", @@ -78,26 +56,6 @@ } }, "$defs": { - "SetupStep": { - "type": "object", - "description": "A lightweight step that runs before the dependency graph (setup) or after all nodes stop (teardown). 
Supports shell environment variables and project-level Veld variables: ${veld.name}, ${veld.project}, ${veld.root}, ${veld.run}.", - "required": ["name", "command"], - "additionalProperties": false, - "properties": { - "name": { - "type": "string", - "description": "Human-readable name for progress reporting and error messages." - }, - "command": { - "type": "string", - "description": "Shell command to execute. Supports ${veld.name}, ${veld.project}, ${veld.root}, ${veld.run}, and shell environment variables." - }, - "failureMessage": { - "type": "string", - "description": "Optional human-friendly message shown when the command fails (non-zero exit). Useful for prerequisite checks like 'Docker must be running'." - } - } - }, "NodeConfig": { "type": "object", "description": "Configuration for a single node in the dependency graph.", @@ -121,18 +79,6 @@ "$ref": "#/$defs/ClientLogLevels", "description": "Client-side log levels override for all variants of this node." }, - "features": { - "$ref": "#/$defs/FeaturesConfig", - "description": "Feature toggles override for all variants of this node." - }, - "env": { - "$ref": "#/$defs/EnvMap", - "description": "Extra environment variables inherited by all variants of this node. Overrides project-level env. Overridable at variant level." - }, - "cwd": { - "type": "string", - "description": "Working directory for all variants of this node. Relative paths are resolved from the project root. Overridable at variant level. Supports Veld variable substitution." - }, "variants": { "type": "object", "description": "Available variants for this node. Each key is a variant name.", @@ -174,11 +120,14 @@ } }, "env": { - "$ref": "#/$defs/EnvMap", - "description": "Extra environment variables injected into the process. Overrides node-level and project-level env." 
+ "type": "object", + "description": "Extra environment variables injected into the process.", + "additionalProperties": { + "type": "string" + } }, "outputs": { - "description": "Output declarations. For command steps: an array of output names captured from $VELD_OUTPUT_FILE (preferred) or VELD_OUTPUT stdout lines (legacy). For start_server steps: an object mapping output names to template strings.", + "description": "Output declarations. For command steps: an array of output names captured from VELD_OUTPUT. For start_server steps: an object mapping output names to template strings.", "oneOf": [ { "type": "array", @@ -206,7 +155,7 @@ "strict_outputs": { "type": "boolean", "default": true, - "description": "When true (default), fail if the command produces output keys (via $VELD_OUTPUT_FILE or stdout) not declared in \"outputs\". Set to false to silently ignore undeclared outputs." + "description": "When true (default), fail if the command produces VELD_OUTPUT keys not declared in \"outputs\". Set to false to silently ignore undeclared outputs." }, "verify": { "type": "string", @@ -223,14 +172,6 @@ "client_log_levels": { "$ref": "#/$defs/ClientLogLevels", "description": "Client-side log levels override for this specific variant." - }, - "features": { - "$ref": "#/$defs/FeaturesConfig", - "description": "Feature toggles override for this specific variant." - }, - "cwd": { - "type": "string", - "description": "Working directory for this variant. Relative paths are resolved from the project root. Overrides node-level cwd. Supports Veld variable substitution." } }, "allOf": [ @@ -257,28 +198,6 @@ } ] }, - "FeaturesConfig": { - "type": "object", - "description": "Feature toggles for controlling which Veld capabilities are injected into a server's HTML responses. All properties are optional — omitted values inherit from the parent level (variant inherits from node, node from project). 
The built-in default for all features is true (enabled).", - "additionalProperties": false, - "properties": { - "feedback_overlay": { - "type": "boolean", - "description": "Inject the feedback overlay toolbar (FAB, screenshot, comments) into HTML responses. Default: true.", - "default": true - }, - "client_logs": { - "type": "boolean", - "description": "Inject the client-side log collector into HTML responses. Default: true.", - "default": true - }, - "inject": { - "type": "boolean", - "description": "Automatically inject bootstrap scripts into HTML responses. When false, the /__veld__/* proxy routes are still created so you can manually add