diff --git a/AGENTS.md b/AGENTS.md
index 0c8904a..4297740 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -73,9 +73,9 @@ When a change introduces new config fields, CLI flags, subcommands, or user-visi
 |------|----------------|
 | `README.md` | Features list, CLI reference table, Configuration section |
 | `docs/configuration.md` | Config field reference (top-level table, field section, variant table) |
-| `skills/veld-config/SKILL.md` | Agent-facing config reference |
-| `skills/veld-usage/SKILL.md` | Agent-facing CLI reference |
-| `schema/v1/veld.schema.json` | JSON Schema (usually updated in code, but verify) |
+| `skills/veld/SKILL.md` | Agent-facing skill (quick reference, gotchas) |
+| `skills/veld/reference/config.md` | Agent-facing config reference |
+| `schema/v2/veld.schema.json` | JSON Schema for v2 configs (probes, recovery, skip_if) |
 | `website/llms-full.txt` | LLM-facing docs (if applicable, see `website/AGENTS.md`) |
 
 If the change is purely internal (refactor, bugfix with no new surface area), this checklist does not apply.
diff --git a/Cargo.lock b/Cargo.lock
index c7cad96..d70ff4f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1862,7 +1862,7 @@ checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
 
 [[package]]
 name = "veld"
-version = "6.4.0"
+version = "6.5.1"
 dependencies = [
  "anyhow",
  "chrono",
@@ -1882,7 +1882,7 @@ dependencies = [
 
 [[package]]
 name = "veld-core"
-version = "6.4.0"
+version = "6.5.1"
 dependencies = [
  "anyhow",
  "base64",
@@ -1904,13 +1904,14 @@ dependencies = [
 
 [[package]]
 name = "veld-daemon"
-version = "6.4.0"
+version = "6.5.1"
 dependencies = [
  "anyhow",
  "axum",
  "chrono",
  "dirs",
  "libc",
+ "reqwest",
  "serde",
  "serde_json",
  "thiserror",
@@ -1923,7 +1924,7 @@ dependencies = [
 
 [[package]]
 name = "veld-helper"
-version = "6.4.0"
+version = "6.5.1"
 dependencies = [
  "anyhow",
  "nix",
diff --git a/PRD.md b/PRD.md
index c4c5336..39bfd19 100644
--- a/PRD.md
+++ b/PRD.md
@@ -724,7 +724,7 @@ No cross-compilation until v1 is stable. No Tauri. No GTK. No npm in CI.
         "local": {
           "type": "command",
           "script": "./scripts/clone-db.sh",
-          "verify": "./scripts/verify-db.sh",
+          "skip_if": "./scripts/verify-db.sh",
           "outputs": ["DATABASE_URL"],
           "sensitive_outputs": ["DATABASE_URL"]
         },
diff --git a/README.md b/README.md
index 2542d43..d4022f0 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,8 @@ No port numbers. No manual wiring. Just clean, stable, human-readable URLs.
 - **No port numbers** — work with stable HTTPS URLs instead of `localhost:3847`
 - **Dependency graph** — resolves node dependencies, parallelizes startup, reverse-order teardown
 - **TLS by default** — Caddy's internal CA handles TLS termination, auto-trusted during setup
-- **Health checks** — two-phase checks (TCP port + HTTP endpoint) before marking services healthy
+- **Health checks** — readiness probes (two-phase: TCP port + HTTP/command) gate startup; liveness probes detect failures after startup (e.g., dropped SSH tunnels)
+- **Automatic recovery** — when liveness probes detect failure, the environment is automatically restarted (configurable failure threshold and max recovery attempts)
 - **Multiple variants** — same node, different behaviors (local server, Docker, remote URL)
 - **Named runs** — multiple environments coexist; re-running by name is idempotent
 - **Setup / teardown** — project-level lifecycle steps that gate startup (check Docker, create networks) and clean up after stop
@@ -26,6 +27,7 @@ No port numbers. No manual wiring. Just clean, stable, human-readable URLs.
 - **Structured output** — all commands support `--json` for scripting and CI
 - **Browser dashboard** — management UI at `https://veld.localhost` with service health, logs, search, stop/restart
 - **Client-side logs** — captures browser `console.log/warn/error`, exceptions, and promise rejections; view with `veld logs --source client`
+- **Internal logs** — liveness probe outcomes (with stderr), recovery decisions, health state transitions; view with `veld logs --source internal`
 
 ## Install
 
@@ -72,8 +74,8 @@ cargo build --release
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "myproject",
   "url_template": "{service}.{run}.{project}.localhost",
   "nodes": {
@@ -83,7 +85,7 @@ cargo build --release
         "local": {
           "type": "start_server",
           "command": "npm run dev -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health", "timeout_seconds": 30 }
+          "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 30 } }
         }
       }
     },
@@ -93,7 +95,7 @@ cargo build --release
         "local": {
           "type": "start_server",
           "command": "npm run dev -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/", "timeout_seconds": 30 },
+          "probes": { "readiness": { "type": "http", "path": "/", "timeout_seconds": 30 } },
           "depends_on": { "backend": "local" },
           "env": { "NEXT_PUBLIC_API_URL": "${nodes.backend.url}" }
         }
@@ -152,7 +154,7 @@ veld stop --name dev
 ### Step types
 
 - **`start_server`** — long-running process. Veld allocates a port (`${veld.port}`), starts the process, and runs health checks.
-- **`command`** — runs a command to completion. Can emit outputs by writing `key=value` lines to `$VELD_OUTPUT_FILE` (preferred) or via `VELD_OUTPUT key=value` on stdout (legacy, discouraged). Optional `verify` command for idempotency.
+- **`command`** — runs a command to completion. Can emit outputs by writing `key=value` lines to `$VELD_OUTPUT_FILE` (preferred) or via `VELD_OUTPUT key=value` on stdout (legacy, discouraged). Optional `skip_if` command for idempotency.
 
 ### Setup & teardown
 
diff --git a/crates/veld-core/src/config.rs b/crates/veld-core/src/config.rs
index 4f78c4b..78c36b0 100644
--- a/crates/veld-core/src/config.rs
+++ b/crates/veld-core/src/config.rs
@@ -165,10 +165,16 @@ pub struct VariantConfig {
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub script: Option<String>,
 
-    /// Health check configuration (start_server only).
+    /// Legacy health check configuration (start_server only).
+    /// Deprecated: use `probes.readiness` instead.
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub health_check: Option<HealthCheck>,
 
+    /// Readiness and liveness probe configuration.
+    /// `probes.readiness` supersedes the legacy `health_check` field.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub probes: Option<ProbesConfig>,
+
     /// Dependencies: node name -> variant name.
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub depends_on: Option<HashMap<String, String>>,
@@ -193,9 +199,10 @@ pub struct VariantConfig {
     #[serde(default = "default_strict_outputs")]
     pub strict_outputs: bool,
 
-    /// Idempotency verify command (command steps only).
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub verify: Option<String>,
+    /// Idempotency check — the step is skipped when this `skip_if` command exits 0.
+    /// Previously named `verify` (still accepted for backward compatibility).
+    #[serde(default, skip_serializing_if = "Option::is_none", alias = "verify")]
+    pub skip_if: Option<String>,
 
     /// Optional URL template override for this specific variant.
     #[serde(default, skip_serializing_if = "Option::is_none")]
@@ -414,6 +421,86 @@ pub struct HealthCheck {
     pub interval_ms: u64,
 }
 
+// ---------------------------------------------------------------------------
+// Probes
+// Readiness gates startup; liveness runs continuously once a node is healthy.
+// ---------------------------------------------------------------------------
+
+/// Readiness and liveness probe configuration for a variant.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ProbesConfig {
+    /// Readiness probe — gates the dependency graph during startup.
+    /// Same semantics as the legacy `health_check` field.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub readiness: Option<HealthCheck>,
+
+    /// Liveness probe — runs continuously after the node is healthy.
+    /// Triggers recovery when `failure_threshold` consecutive checks fail.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub liveness: Option<LivenessProbe>,
+}
+
+/// Liveness probe configuration. Shares check-type fields with `HealthCheck`
+/// (but has no `timeout_seconds`) and adds a failure threshold and recovery limit.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LivenessProbe {
+    /// Check kind: "http", "port", or "command" (plain string, any value parses).
+    #[serde(rename = "type")]
+    pub check_type: String,
+
+    /// HTTP path for type "http".
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub path: Option<String>,
+
+    /// Expected HTTP status code (omitted from serialized output when unset).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub expect_status: Option<u16>,
+
+    /// Command for type "command".
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub command: Option<String>,
+
+    /// Milliseconds between liveness checks (default 5000).
+    #[serde(default = "default_liveness_interval")]
+    pub interval_ms: u64,
+
+    /// Consecutive failures before triggering recovery (default 3).
+    #[serde(default = "default_failure_threshold")]
+    pub failure_threshold: u32,
+
+    /// Maximum number of recovery attempts before permanent failure (default 3).
+    #[serde(default = "default_max_recoveries")]
+    pub max_recoveries: u32,
+}
+
+fn default_liveness_interval() -> u64 {
+    5000
+}
+
+fn default_failure_threshold() -> u32 {
+    3
+}
+
+fn default_max_recoveries() -> u32 {
+    3
+}
+
+impl VariantConfig {
+    /// Effective readiness probe: `probes.readiness` when set, otherwise the
+    /// legacy `health_check` field; `None` when neither is configured.
+    pub fn readiness_probe(&self) -> Option<&HealthCheck> {
+        self.probes
+            .as_ref()
+            .and_then(|p| p.readiness.as_ref())
+            .or(self.health_check.as_ref())
+    }
+
+    /// Liveness probe from `probes.liveness`, if configured (no legacy fallback).
+    pub fn liveness_probe(&self) -> Option<&LivenessProbe> {
+        self.probes.as_ref().and_then(|p| p.liveness.as_ref())
+    }
+}
+
 fn default_strict_outputs() -> bool {
     true
 }
@@ -457,7 +544,7 @@ pub fn load_config(path: &Path) -> Result<VeldConfig, ConfigError> {
             source: e,
         })?;
 
-    if config.schema_version != "1" {
+    if config.schema_version != "1" && config.schema_version != "2" {
         return Err(ConfigError::UnsupportedSchemaVersion(
             config.schema_version.clone(),
         ));
@@ -862,4 +949,155 @@ mod tests {
         assert!(config.setup.is_none());
         assert!(config.teardown.is_none());
     }
+
+    // -- Probes config tests ---------------------------------------------------
+
+    #[test]
+    fn test_probes_config_deserialization() {
+        let json = r#"{
+            "readiness": {
+                "type": "http",
+                "path": "/health",
+                "timeout_seconds": 30,
+                "interval_ms": 500
+            },
+            "liveness": {
+                "type": "command",
+                "command": "pg_isready",
+                "interval_ms": 5000,
+                "failure_threshold": 5,
+                "max_recoveries": 2
+            }
+        }"#;
+        let probes: ProbesConfig = serde_json::from_str(json).unwrap();
+        let readiness = probes.readiness.unwrap();
+        assert_eq!(readiness.check_type, "http");
+        assert_eq!(readiness.path.as_deref(), Some("/health"));
+        assert_eq!(readiness.timeout_seconds, 30);
+
+        let liveness = probes.liveness.unwrap();
+        assert_eq!(liveness.check_type, "command");
+        assert_eq!(liveness.command.as_deref(), Some("pg_isready"));
+        assert_eq!(liveness.interval_ms, 5000);
+        assert_eq!(liveness.failure_threshold, 5);
+        assert_eq!(liveness.max_recoveries, 2);
+    }
+
+    #[test]
+    fn test_liveness_probe_defaults() {
+        let json = r#"{"type": "command", "command": "true"}"#;
+        let liveness: LivenessProbe = serde_json::from_str(json).unwrap();
+        assert_eq!(liveness.interval_ms, 5000);
+        assert_eq!(liveness.failure_threshold, 3);
+        assert_eq!(liveness.max_recoveries, 3);
+    }
+
+    // -- skip_if / verify alias tests ------------------------------------------
+
+    #[test]
+    fn test_skip_if_field() {
+        let json = r#"{
+            "type": "command",
+            "command": "echo run",
+            "skip_if": "test -f /tmp/done"
+        }"#;
+        let v: VariantConfig = serde_json::from_str(json).unwrap();
+        assert_eq!(v.skip_if.as_deref(), Some("test -f /tmp/done"));
+    }
+
+    #[test]
+    fn test_verify_alias_for_skip_if() {
+        let json = r#"{
+            "type": "command",
+            "command": "echo run",
+            "verify": "test -f /tmp/done"
+        }"#;
+        let v: VariantConfig = serde_json::from_str(json).unwrap();
+        assert_eq!(v.skip_if.as_deref(), Some("test -f /tmp/done"));
+    }
+
+    // -- Schema v2 parsing (serde only; load_config's version gate not run) -----
+
+    #[test]
+    fn test_schema_version_2_accepted() {
+        let json = r#"{
+            "schemaVersion": "2",
+            "name": "test-project",
+            "nodes": {
+                "db": {
+                    "variants": {
+                        "local": {
+                            "type": "command",
+                            "command": "echo start",
+                            "probes": {
+                                "liveness": {
+                                    "type": "command",
+                                    "command": "pg_isready"
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }"#;
+        let config: VeldConfig = serde_json::from_str(json).unwrap();
+        assert_eq!(config.schema_version, "2");
+        let variant = &config.nodes["db"].variants["local"];
+        assert!(variant.probes.is_some());
+        let liveness = variant.liveness_probe().unwrap();
+        assert_eq!(liveness.check_type, "command");
+    }
+
+    // -- Readiness probe helper tests ------------------------------------------
+
+    #[test]
+    fn test_readiness_probe_from_probes() {
+        let json = r#"{
+            "type": "start_server",
+            "command": "npm start",
+            "probes": {
+                "readiness": {
+                    "type": "http",
+                    "path": "/health"
+                }
+            }
+        }"#;
+        let v: VariantConfig = serde_json::from_str(json).unwrap();
+        let probe = v.readiness_probe().unwrap();
+        assert_eq!(probe.check_type, "http");
+    }
+
+    #[test]
+    fn test_readiness_probe_fallback_to_health_check() {
+        let json = r#"{
+            "type": "start_server",
+            "command": "npm start",
+            "health_check": {
+                "type": "port"
+            }
+        }"#;
+        let v: VariantConfig = serde_json::from_str(json).unwrap();
+        let probe = v.readiness_probe().unwrap();
+        assert_eq!(probe.check_type, "port");
+    }
+
+    #[test]
+    fn test_readiness_probe_probes_overrides_health_check() {
+        let json = r#"{
+            "type": "start_server",
+            "command": "npm start",
+            "health_check": {
+                "type": "port"
+            },
+            "probes": {
+                "readiness": {
+                    "type": "http",
+                    "path": "/ready"
+                }
+            }
+        }"#;
+        let v: VariantConfig = serde_json::from_str(json).unwrap();
+        let probe = v.readiness_probe().unwrap();
+        assert_eq!(probe.check_type, "http");
+    }
 }
diff --git a/crates/veld-core/src/graph.rs b/crates/veld-core/src/graph.rs
index 755d765..f8123fc 100644
--- a/crates/veld-core/src/graph.rs
+++ b/crates/veld-core/src/graph.rs
@@ -374,6 +374,58 @@ fn validate_sensitive_outputs(
     Ok(())
 }
 
+/// Return every node in `all_nodes` that transitively depends on `target`, in
+/// breadth-first order (nearest dependents first; not strictly topological).
+/// Panics if a selection in `all_nodes` is missing from `config`.
+pub fn get_dependents(
+    target: &NodeSelection,
+    all_nodes: &[NodeSelection],
+    config: &VeldConfig,
+) -> Vec<NodeSelection> {
+    // Reverse adjacency keyed by "node:variant": who depends on each selection.
+    let mut reverse_deps: HashMap<String, Vec<NodeSelection>> = HashMap::new();
+    for sel in all_nodes {
+        let variant_cfg = &config.nodes[&sel.node].variants[&sel.variant];
+        if let Some(dep_map) = &variant_cfg.depends_on {
+            for (dep_node, dep_variant) in dep_map {
+                let dep_key = format!("{dep_node}:{dep_variant}");
+                reverse_deps.entry(dep_key).or_default().push(sel.clone());
+            }
+        }
+    }
+
+    // BFS from target over reverse edges; target is pre-marked so it is never returned.
+    let mut visited: HashSet<String> = HashSet::new();
+    let mut queue: VecDeque<NodeSelection> = VecDeque::new();
+    let target_key = format!("{}:{}", target.node, target.variant);
+    visited.insert(target_key.clone());
+
+    if let Some(direct) = reverse_deps.get(&target_key) {
+        for dep in direct {
+            let key = format!("{}:{}", dep.node, dep.variant);
+            if visited.insert(key) {
+                queue.push_back(dep.clone());
+            }
+        }
+    }
+
+    let mut result = Vec::new();
+    while let Some(sel) = queue.pop_front() {
+        let key = format!("{}:{}", sel.node, sel.variant);
+        if let Some(further) = reverse_deps.get(&key) {
+            for dep in further {
+                let dep_key = format!("{}:{}", dep.node, dep.variant);
+                if visited.insert(dep_key) {
+                    queue.push_back(dep.clone());
+                }
+            }
+        }
+        result.push(sel);
+    }
+
+    result
+}
+
 fn check_string_for_ambiguous_refs(
     s: &str,
     active_variants: &HashMap<&str, Vec<&str>>,
@@ -410,3 +462,201 @@ fn check_string_for_ambiguous_refs(
     }
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::config::{NodeConfig, StepType, VariantConfig, VeldConfig};
+    use std::collections::HashMap;
+
+    fn make_config() -> VeldConfig {
+        // db -> api -> frontend (dependency chain)
+        let db_variant = VariantConfig {
+            step_type: StepType::Command,
+            command: Some("echo db".into()),
+            script: None,
+            health_check: None,
+            probes: None,
+            depends_on: None,
+            env: None,
+            outputs: None,
+            sensitive_outputs: None,
+            strict_outputs: true,
+            skip_if: None,
+            url_template: None,
+            on_stop: None,
+            client_log_levels: None,
+            features: None,
+            cwd: None,
+        };
+        let api_variant = VariantConfig {
+            step_type: StepType::StartServer,
+            command: Some("echo api".into()),
+            script: None,
+            health_check: None,
+            probes: None,
+            depends_on: Some(HashMap::from([("db".into(), "local".into())])),
+            env: None,
+            outputs: None,
+            sensitive_outputs: None,
+            strict_outputs: true,
+            skip_if: None,
+            url_template: None,
+            on_stop: None,
+            client_log_levels: None,
+            features: None,
+            cwd: None,
+        };
+        let frontend_variant = VariantConfig {
+            step_type: StepType::StartServer,
+            command: Some("echo fe".into()),
+            script: None,
+            health_check: None,
+            probes: None,
+            depends_on: Some(HashMap::from([("api".into(), "local".into())])),
+            env: None,
+            outputs: None,
+            sensitive_outputs: None,
+            strict_outputs: true,
+            skip_if: None,
+            url_template: None,
+            on_stop: None,
+            client_log_levels: None,
+            features: None,
+            cwd: None,
+        };
+
+        VeldConfig {
+            schema: None,
+            schema_version: "2".into(),
+            name: "test".into(),
+            url_template: "{service}.{run}.{project}.localhost".into(),
+            presets: None,
+            client_log_levels: None,
+            features: None,
+            env: None,
+            setup: None,
+            teardown: None,
+            nodes: HashMap::from([
+                (
+                    "db".into(),
+                    NodeConfig {
+                        default_variant: Some("local".into()),
+                        url_template: None,
+                        hidden: None,
+                        client_log_levels: None,
+                        features: None,
+                        env: None,
+                        cwd: None,
+                        variants: HashMap::from([("local".into(), db_variant)]),
+                    },
+                ),
+                (
+                    "api".into(),
+                    NodeConfig {
+                        default_variant: Some("local".into()),
+                        url_template: None,
+                        hidden: None,
+                        client_log_levels: None,
+                        features: None,
+                        env: None,
+                        cwd: None,
+                        variants: HashMap::from([("local".into(), api_variant)]),
+                    },
+                ),
+                (
+                    "frontend".into(),
+                    NodeConfig {
+                        default_variant: Some("local".into()),
+                        url_template: None,
+                        hidden: None,
+                        client_log_levels: None,
+                        features: None,
+                        env: None,
+                        cwd: None,
+                        variants: HashMap::from([("local".into(), frontend_variant)]),
+                    },
+                ),
+            ]),
+        }
+    }
+
+    #[test]
+    fn test_get_dependents_leaf_node() {
+        let config = make_config();
+        let all_nodes = vec![
+            NodeSelection {
+                node: "db".into(),
+                variant: "local".into(),
+            },
+            NodeSelection {
+                node: "api".into(),
+                variant: "local".into(),
+            },
+            NodeSelection {
+                node: "frontend".into(),
+                variant: "local".into(),
+            },
+        ];
+        let target = NodeSelection {
+            node: "frontend".into(),
+            variant: "local".into(),
+        };
+        let deps = get_dependents(&target, &all_nodes, &config);
+        assert!(deps.is_empty(), "leaf node should have no dependents");
+    }
+
+    #[test]
+    fn test_get_dependents_root_node() {
+        let config = make_config();
+        let all_nodes = vec![
+            NodeSelection {
+                node: "db".into(),
+                variant: "local".into(),
+            },
+            NodeSelection {
+                node: "api".into(),
+                variant: "local".into(),
+            },
+            NodeSelection {
+                node: "frontend".into(),
+                variant: "local".into(),
+            },
+        ];
+        let target = NodeSelection {
+            node: "db".into(),
+            variant: "local".into(),
+        };
+        let deps = get_dependents(&target, &all_nodes, &config);
+        assert_eq!(deps.len(), 2);
+        let dep_names: Vec<String> = deps.iter().map(|d| d.node.clone()).collect();
+        assert!(dep_names.contains(&"api".to_string()));
+        assert!(dep_names.contains(&"frontend".to_string()));
+    }
+
+    #[test]
+    fn test_get_dependents_middle_node() {
+        let config = make_config();
+        let all_nodes = vec![
+            NodeSelection {
+                node: "db".into(),
+                variant: "local".into(),
+            },
+            NodeSelection {
+                node: "api".into(),
+                variant: "local".into(),
+            },
+            NodeSelection {
+                node: "frontend".into(),
+                variant: "local".into(),
+            },
+        ];
+        let target = NodeSelection {
+            node: "api".into(),
+            variant: "local".into(),
+        };
+        let deps = get_dependents(&target, &all_nodes, &config);
+        assert_eq!(deps.len(), 1);
+        assert_eq!(deps[0].node, "frontend");
+    }
+}
diff --git a/crates/veld-core/src/logging.rs b/crates/veld-core/src/logging.rs
index 27be4cd..b7f15fe 100644
--- a/crates/veld-core/src/logging.rs
+++ b/crates/veld-core/src/logging.rs
@@ -59,6 +59,12 @@ pub fn debug_log_file(project_root: &Path, run_name: &str) -> PathBuf {
     log_dir(project_root, run_name).join("veld-debug.log")
 }
 
+/// Return the path of the per-run internal log, `_veld.log` in the run's log dir.
+/// Contains liveness probe outcomes, recovery decisions, health transitions.
+pub fn internal_log_file(project_root: &Path, run_name: &str) -> PathBuf {
+    log_dir(project_root, run_name).join("_veld.log")
+}
+
 /// Return a temporary output file path for a command node.
 ///
 /// Scripts write `key=value` lines to this file instead of emitting
diff --git a/crates/veld-core/src/orchestrator.rs b/crates/veld-core/src/orchestrator.rs
index 2599a31..795f1fd 100644
--- a/crates/veld-core/src/orchestrator.rs
+++ b/crates/veld-core/src/orchestrator.rs
@@ -17,7 +17,7 @@ use crate::port::PortAllocator;
 use crate::process;
 use crate::progress::ProgressEvent;
 use crate::state::{
-    GlobalRegistry, HealthCheckPhase, NodeState, NodeStatus, ProjectState, RegistryEntry,
+    GlobalRegistry, NodeState, NodeStatus, ProjectState, ReadinessPhase, RegistryEntry,
     RegistryRunInfo, RunState, RunStatus,
 };
 use crate::url;
@@ -164,6 +164,8 @@ pub struct Orchestrator {
     foreground: bool,
     /// Optional channel for live progress events.
     progress_tx: Option<mpsc::UnboundedSender<ProgressEvent>>,
+    /// Internal log writer for the current run (liveness/recovery/lifecycle events).
+    internal_log: Option<LogWriter>,
 }
 
 impl Orchestrator {
@@ -183,6 +185,7 @@ impl Orchestrator {
             debug_writer: None,
             foreground: false,
             progress_tx: None,
+            internal_log: None,
         }
     }
 
@@ -220,6 +223,13 @@ impl Orchestrator {
         }
     }
 
+    /// Best-effort write to the per-run internal log; no-op if unset, errors dropped.
+    async fn internal_log(&self, message: &str) {
+        if let Some(ref writer) = self.internal_log {
+            let _ = writer.write_line(message).await;
+        }
+    }
+
     /// Convenience: discover config from CWD and build the orchestrator.
     pub fn from_cwd() -> Result<Self, OrchestratorError> {
         let (path, cfg) = config::load_config_from_cwd()?;
@@ -260,6 +270,12 @@ impl Orchestrator {
             }
         }
 
+        // Create internal log writer for this run.
+        let log_path = logging::internal_log_file(&self.project_root, run_name);
+        if let Ok(writer) = LogWriter::new(log_path).await {
+            self.internal_log = Some(writer);
+        }
+
         let resolved = graph::resolve_selections(selections, &self.config)?;
         let plan = graph::build_execution_plan(&resolved, &self.config)?;
 
@@ -394,6 +410,13 @@ impl Orchestrator {
 
         // Count total nodes for progress reporting.
         let total_nodes: usize = plan.iter().map(|s| s.len()).sum();
+        self.internal_log(&format!(
+            "[start] starting environment '{}' — {} node(s) in {} stage(s)",
+            run_name,
+            total_nodes,
+            plan.len()
+        ))
+        .await;
         self.emit(ProgressEvent::PlanResolved {
             total_nodes,
             stages: plan.len(),
@@ -436,6 +459,10 @@ impl Orchestrator {
         }
         .await;
 
+        if let Err(ref e) = execute_result {
+            self.internal_log(&format!("[start] startup failed: {e}"))
+                .await;
+        }
         if let Err(e) = execute_result {
             // Release all remaining port reservations so the ports become
             // available to the system immediately.
@@ -451,6 +478,12 @@ impl Orchestrator {
         // Final state save with Running status.
         self.save_state(&run)?;
 
+        self.internal_log(&format!(
+            "[start] environment '{}' is running — all {} node(s) healthy",
+            run_name, total_nodes
+        ))
+        .await;
+
         Ok(run)
     }
 
@@ -588,6 +621,16 @@ impl Orchestrator {
     /// Stop a run in reverse dependency order. Returns whether the run was
     /// actually stopped or was already stopped.
     pub async fn stop(&mut self, run_name: &str) -> Result<StopResult, OrchestratorError> {
+        // Create internal log writer for this run (may already exist from start).
+        if self.internal_log.is_none() {
+            let log_path = logging::internal_log_file(&self.project_root, run_name);
+            if let Ok(writer) = LogWriter::new(log_path).await {
+                self.internal_log = Some(writer);
+            }
+        }
+        self.internal_log(&format!("[stop] stopping environment '{run_name}'"))
+            .await;
+
         // Reconnect to whichever helper is running (system or user socket)
         if let Ok(client) = crate::helper::HelperClient::connect().await {
             self.helper_client = client;
@@ -633,6 +676,12 @@ impl Orchestrator {
 
         for key in node_keys.iter().rev() {
             if let Some(node_state) = run.nodes.get_mut(key) {
+                self.internal_log(&format!(
+                    "[stop] stopping {}:{} (pid: {:?})",
+                    node_state.node_name, node_state.variant, node_state.pid
+                ))
+                .await;
+
                 // Kill process if running.
                 if let Some(pid) = node_state.pid {
                     if process::is_alive(pid) {
@@ -678,6 +727,9 @@ impl Orchestrator {
         // Remove from global registry.
         self.remove_from_registry(run_name);
 
+        self.internal_log(&format!("[stop] environment '{run_name}' stopped"))
+            .await;
+
         Ok(StopResult::Stopped)
     }
 
@@ -1131,7 +1183,7 @@ async fn debug_log_free(writer: &Option<LogWriter>, message: &str) {
     }
 }
 
-/// Build a health-check attempt notifier that sends progress events.
+/// Build a readiness probe attempt notifier that sends progress events.
 fn make_attempt_notifier(
     tx: &Option<mpsc::UnboundedSender<ProgressEvent>>,
     node: &str,
@@ -1143,7 +1195,7 @@ fn make_attempt_notifier(
     let variant = variant.to_owned();
     Box::new(move |attempt| {
         if let Some(tx) = &tx {
-            let _ = tx.send(ProgressEvent::HealthCheckAttempt {
+            let _ = tx.send(ProgressEvent::ReadinessProbeAttempt {
                 node: node.clone(),
                 variant: variant.clone(),
                 phase,
@@ -1456,24 +1508,26 @@ async fn execute_start_server_isolated(
         let _ = project_state.save(&project_root);
     }
 
-    // Health check — inlined to emit progress events between phases.
+    // Readiness probe — inlined to emit progress events between phases.
     debug_log_free(
         &ctx.debug_writer,
         &format!(
-            "{}:{} — process started (pid {}), beginning health checks",
+            "{}:{} — process started (pid {}), beginning readiness checks",
             sel.node, sel.variant, pid
         ),
     )
     .await;
-    if let Some(ref hc) = variant_cfg.health_check {
+    // Use probes.readiness if available, falling back to legacy health_check.
+    if let Some(hc) = variant_cfg.readiness_probe() {
+        let hc = hc.clone();
         node_state.status = NodeStatus::HealthChecking;
-        node_state.health_phases.push(HealthCheckPhase {
+        node_state.readiness_phases.push(ReadinessPhase {
             phase: 1,
             passed: false,
             last_error: None,
             passed_at: None,
         });
-        node_state.health_phases.push(HealthCheckPhase {
+        node_state.readiness_phases.push(ReadinessPhase {
             phase: 2,
             passed: false,
             last_error: None,
@@ -1535,7 +1589,7 @@ async fn execute_start_server_isolated(
         // Phase 1: TCP port check.
         emit_progress(
             &ctx.progress_tx,
-            ProgressEvent::HealthCheckPhase {
+            ProgressEvent::ReadinessProbePhase {
                 node: sel.node.clone(),
                 variant: sel.variant.clone(),
                 phase: 1,
@@ -1544,7 +1598,7 @@ async fn execute_start_server_isolated(
         );
 
         let phase1_result = tokio::select! {
-            result = health::wait_for_port(port, hc, Some(&phase1_notifier)) => result,
+            result = health::wait_for_port(port, &hc, Some(&phase1_notifier)) => result,
             _ = wait_for_process_exit(pid) => {
                 Err(health::HealthError::PortCheckFailed(
                     "server process exited before binding to port".into(),
@@ -1555,11 +1609,11 @@ async fn execute_start_server_isolated(
         if let Err(e) = phase1_result {
             let msg = format!("process did not bind to port {port}: {e}");
             node_state.status = NodeStatus::Failed;
-            node_state.health_phases[0].last_error = Some(msg.clone());
+            node_state.readiness_phases[0].last_error = Some(msg.clone());
             debug_log_free(
                 &ctx.debug_writer,
                 &format!(
-                    "{}:{} — health check phase 1 FAILED: {}",
+                    "{}:{} — readiness phase 1 FAILED: {}",
                     sel.node, sel.variant, msg
                 ),
             )
@@ -1580,11 +1634,11 @@ async fn execute_start_server_isolated(
         }
 
         let now = chrono::Utc::now();
-        node_state.health_phases[0].passed = true;
-        node_state.health_phases[0].passed_at = Some(now);
+        node_state.readiness_phases[0].passed = true;
+        node_state.readiness_phases[0].passed_at = Some(now);
         emit_progress(
             &ctx.progress_tx,
-            ProgressEvent::HealthCheckPassed {
+            ProgressEvent::ReadinessProbePassed {
                 node: sel.node.clone(),
                 variant: sel.variant.clone(),
                 phase: 1,
@@ -1599,13 +1653,13 @@ async fn execute_start_server_isolated(
         // Phase 2: depends on check type.
         let phase2_desc = match hc.check_type.as_str() {
             "http" => format!("HTTP check on port {port}"),
-            "command" | "bash" => "command health check".to_owned(),
+            "command" | "bash" => "command readiness check".to_owned(),
             "port" => "port-only (no phase 2)".to_owned(),
             other => format!("unknown check type: {other}"),
         };
         emit_progress(
             &ctx.progress_tx,
-            ProgressEvent::HealthCheckPhase {
+            ProgressEvent::ReadinessProbePhase {
                 node: sel.node.clone(),
                 variant: sel.variant.clone(),
                 phase: 2,
@@ -1617,14 +1671,14 @@ async fn execute_start_server_isolated(
             match hc.check_type.as_str() {
                 "http" => {
                     let direct_url = format!("http://127.0.0.1:{port}");
-                    health::wait_for_http(&direct_url, hc, Some(&phase2_notifier)).await
+                    health::wait_for_http(&direct_url, &hc, Some(&phase2_notifier)).await
                 }
                 "command" | "bash" => {
                     if let Some(cmd) = &hc.command {
                         health::wait_for_command_check(
                             cmd,
                             &working_dir,
-                            hc,
+                            &hc,
                             Some(&phase2_notifier),
                         )
                         .await
@@ -1640,7 +1694,7 @@ async fn execute_start_server_isolated(
             result = phase2_future => result,
             _ = wait_for_process_exit(pid) => {
                 Err(health::HealthError::PortCheckFailed(
-                    "server process exited during health check".into(),
+                    "server process exited during readiness check".into(),
                 ))
             }
         };
@@ -1648,12 +1702,12 @@ async fn execute_start_server_isolated(
         match phase2_result {
             Ok(()) => {
                 let now = chrono::Utc::now();
-                node_state.health_phases[1].passed = true;
-                node_state.health_phases[1].passed_at = Some(now);
+                node_state.readiness_phases[1].passed = true;
+                node_state.readiness_phases[1].passed_at = Some(now);
                 node_state.status = NodeStatus::Healthy;
                 emit_progress(
                     &ctx.progress_tx,
-                    ProgressEvent::HealthCheckPassed {
+                    ProgressEvent::ReadinessProbePassed {
                         node: sel.node.clone(),
                         variant: sel.variant.clone(),
                         phase: 2,
@@ -1662,7 +1716,7 @@ async fn execute_start_server_isolated(
                 debug_log_free(
                     &ctx.debug_writer,
                     &format!(
-                        "{}:{} — health check passed, node is healthy",
+                        "{}:{} — readiness check passed, node is healthy",
                         sel.node, sel.variant
                     ),
                 )
@@ -1671,11 +1725,11 @@ async fn execute_start_server_isolated(
             Err(e) => {
                 node_state.status = NodeStatus::Failed;
                 let msg = e.to_string();
-                node_state.health_phases[1].last_error = Some(msg.clone());
+                node_state.readiness_phases[1].last_error = Some(msg.clone());
                 debug_log_free(
                     &ctx.debug_writer,
                     &format!(
-                        "{}:{} — health check phase 2 FAILED: {}",
+                        "{}:{} — readiness phase 2 FAILED: {}",
                         sel.node, sel.variant, msg
                     ),
                 )
@@ -1736,16 +1790,17 @@ async fn execute_command_isolated(
     );
     let env = build_env(merged_env.as_ref(), var_ctx)?;
 
-    // Verify step (idempotency).
-    if let Some(ref verify_cmd) = variant_cfg.verify {
-        let verify_resolved = crate::variables::interpolate(verify_cmd, var_ctx)?;
-        let verify_result = process::run_command(&verify_resolved, &working_dir, &env, None).await;
-        if let Ok(ref out) = verify_result {
+    // Idempotency check (skip_if).
+    if let Some(ref skip_if_cmd) = variant_cfg.skip_if {
+        let skip_if_resolved = crate::variables::interpolate(skip_if_cmd, var_ctx)?;
+        let skip_if_result =
+            process::run_command(&skip_if_resolved, &working_dir, &env, None).await;
+        if let Ok(ref out) = skip_if_result {
             if out.exit_code == 0 {
                 tracing::info!(
                     node = sel.node,
                     variant = sel.variant,
-                    "verify passed — skipping command step"
+                    "skip_if passed — skipping command step"
                 );
                 node_state.status = NodeStatus::Skipped;
                 node_state
@@ -1811,7 +1866,113 @@ async fn execute_command_isolated(
     }
 
     if result.exit_code == 0 {
-        node_state.status = NodeStatus::Healthy;
+        // Run readiness probe if configured (probes.readiness on command nodes).
+        if let Some(hc) = variant_cfg.readiness_probe() {
+            let hc = hc.clone();
+            node_state.status = NodeStatus::HealthChecking;
+            emit_progress(
+                &ctx.progress_tx,
+                ProgressEvent::ReadinessProbePhase {
+                    node: sel.node.clone(),
+                    variant: sel.variant.clone(),
+                    phase: 1,
+                    description: "readiness probe".to_owned(),
+                },
+            );
+
+            let notifier = make_attempt_notifier(&ctx.progress_tx, &sel.node, &sel.variant, 1);
+            let probe_result = match hc.check_type.as_str() {
+                "command" | "bash" => {
+                    if let Some(cmd) = &hc.command {
+                        health::wait_for_command_check(cmd, &working_dir, &hc, Some(&notifier))
+                            .await
+                    } else {
+                        Ok(())
+                    }
+                }
+                "port" => {
+                    // Port check — look for a port value in outputs.
+                    // Checks common key names; a future enhancement could add
+                    // an explicit `port_key` field to HealthCheck.
+                    let port_str = node_state
+                        .outputs
+                        .get("PORT")
+                        .or(node_state.outputs.get("DB_PORT"))
+                        .or(node_state.outputs.get("SERVICE_PORT"));
+                    if let Some(port_str) = port_str {
+                        if let Ok(port) = port_str.parse::<u16>() {
+                            health::wait_for_port(port, &hc, Some(&notifier)).await
+                        } else {
+                            tracing::warn!(
+                                node = sel.node,
+                                variant = sel.variant,
+                                "readiness port probe: output value is not a valid port number"
+                            );
+                            Ok(())
+                        }
+                    } else {
+                        tracing::warn!(
+                            node = sel.node,
+                            variant = sel.variant,
+                            "readiness port probe skipped: no PORT/DB_PORT/SERVICE_PORT output found"
+                        );
+                        Ok(())
+                    }
+                }
+                "http" => {
+                    // HTTP check — look for a URL value in outputs.
+                    let url = node_state
+                        .outputs
+                        .get("URL")
+                        .or(node_state.outputs.get("DATABASE_URL"))
+                        .or(node_state.outputs.get("SERVICE_URL"));
+                    if let Some(url) = url {
+                        health::wait_for_http(url, &hc, Some(&notifier)).await
+                    } else {
+                        tracing::warn!(
+                            node = sel.node,
+                            variant = sel.variant,
+                            "readiness http probe skipped: no URL/DATABASE_URL/SERVICE_URL output found"
+                        );
+                        Ok(())
+                    }
+                }
+                _ => Ok(()),
+            };
+
+            match probe_result {
+                Ok(()) => {
+                    node_state.status = NodeStatus::Healthy;
+                    emit_progress(
+                        &ctx.progress_tx,
+                        ProgressEvent::ReadinessProbePassed {
+                            node: sel.node.clone(),
+                            variant: sel.variant.clone(),
+                            phase: 1,
+                        },
+                    );
+                }
+                Err(e) => {
+                    node_state.status = NodeStatus::Failed;
+                    let reason = format!("readiness probe failed: {e}");
+                    emit_progress(
+                        &ctx.progress_tx,
+                        ProgressEvent::NodeFailed {
+                            node: sel.node.clone(),
+                            variant: sel.variant.clone(),
+                            error: reason.clone(),
+                        },
+                    );
+                    return Err(OrchestratorError::NodeFailed {
+                        node: sel.node.clone(),
+                        variant: sel.variant.clone(),
+                        reason,
+                    });
+                }
+            }
+        } else {
+            node_state.status = NodeStatus::Healthy;
+        }
     } else {
         node_state.status = NodeStatus::Failed;
         let reason = format!("command step exited with code {}", result.exit_code);
@@ -1838,7 +1999,7 @@ async fn execute_command_isolated(
 // ---------------------------------------------------------------------------
 
 /// Poll until a process is no longer alive. Checks every 250ms.
-/// Used to race health checks against premature process death so the
+/// Used to race readiness checks against premature process death so the
 /// orchestrator can fail fast instead of waiting for the full timeout.
 async fn wait_for_process_exit(pid: u32) {
     loop {
diff --git a/crates/veld-core/src/progress.rs b/crates/veld-core/src/progress.rs
index 5ed6272..0ad6724 100644
--- a/crates/veld-core/src/progress.rs
+++ b/crates/veld-core/src/progress.rs
@@ -25,24 +25,24 @@ pub enum ProgressEvent {
         port: u16,
     },
 
-    /// Health check phase started.
-    HealthCheckPhase {
+    /// Readiness probe phase started.
+    ReadinessProbePhase {
         node: String,
         variant: String,
         phase: u8,
         description: String,
     },
 
-    /// Health check attempt (retry) within a phase.
-    HealthCheckAttempt {
+    /// Readiness probe attempt (retry) within a phase.
+    ReadinessProbeAttempt {
         node: String,
         variant: String,
         phase: u8,
         attempt: u32,
     },
 
-    /// Health check phase passed.
-    HealthCheckPassed {
+    /// Readiness probe phase passed.
+    ReadinessProbePassed {
         node: String,
         variant: String,
         phase: u8,
@@ -56,7 +56,7 @@ pub enum ProgressEvent {
         elapsed_ms: u64,
     },
 
-    /// Node was skipped (verify command passed).
+    /// Node was skipped (skip_if command passed).
     NodeSkipped { node: String, variant: String },
 
     /// Node failed.
@@ -92,9 +92,9 @@ pub enum ProgressEvent {
     /// A teardown step completed.
     TeardownStepCompleted { name: String },
 
-    /// Service log lines streamed during slow health checks.
+    /// Service log lines streamed during slow readiness checks.
     ///
-    /// Emitted after a delay when health checks are taking longer than
+    /// Emitted after a delay when readiness checks are taking longer than
     /// expected, giving the user visibility into what the service is doing.
     NodeLogLines {
         node: String,
@@ -159,28 +159,28 @@ mod tests {
     }
 
     #[test]
-    fn test_health_check_phase_serialization() {
-        let event = ProgressEvent::HealthCheckPhase {
+    fn test_readiness_probe_phase_serialization() {
+        let event = ProgressEvent::ReadinessProbePhase {
             node: "api".into(),
             variant: "local".into(),
             phase: 1,
             description: "waiting for port 8080".into(),
         };
         let json = serde_json::to_string(&event).unwrap();
-        assert!(json.contains("\"type\":\"health_check_phase\""));
+        assert!(json.contains("\"type\":\"readiness_probe_phase\""));
         assert!(json.contains("\"phase\":1"));
     }
 
     #[test]
-    fn test_health_check_attempt_serialization() {
-        let event = ProgressEvent::HealthCheckAttempt {
+    fn test_readiness_probe_attempt_serialization() {
+        let event = ProgressEvent::ReadinessProbeAttempt {
             node: "api".into(),
             variant: "local".into(),
             phase: 1,
             attempt: 5,
         };
         let json = serde_json::to_string(&event).unwrap();
-        assert!(json.contains("\"type\":\"health_check_attempt\""));
+        assert!(json.contains("\"type\":\"readiness_probe_attempt\""));
         assert!(json.contains("\"phase\":1"));
         assert!(json.contains("\"attempt\":5"));
     }
diff --git a/crates/veld-core/src/state.rs b/crates/veld-core/src/state.rs
index 0742ffd..ee97eb7 100644
--- a/crates/veld-core/src/state.rs
+++ b/crates/veld-core/src/state.rs
@@ -44,6 +44,8 @@ pub enum StateError {
 pub enum RunStatus {
     Starting,
     Running,
+    /// A recovery cycle is in progress for one or more nodes.
+    Recovering,
     Stopping,
     Stopped,
     Failed,
@@ -60,17 +62,19 @@ pub enum NodeStatus {
     Starting,
     HealthChecking,
     Healthy,
+    /// Liveness probe failed but recovery has not yet been exhausted.
+    Unhealthy,
     Failed,
     Stopped,
     Skipped,
 }
 
 // ---------------------------------------------------------------------------
-// Health check phase tracking
+// Readiness phase tracking
 // ---------------------------------------------------------------------------
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct HealthCheckPhase {
+pub struct ReadinessPhase {
     pub phase: u8, // 1 = port, 2 = HTTPS
     pub passed: bool,
     pub last_error: Option<String>,
@@ -91,7 +95,18 @@ pub struct NodeState {
     pub port: Option<u16>,
     pub url: Option<String>,
     pub outputs: HashMap<String, String>,
-    pub health_phases: Vec<HealthCheckPhase>,
+    /// Readiness probe phase tracking (renamed from `health_phases` in v7).
+    #[serde(default, alias = "health_phases")]
+    pub readiness_phases: Vec<ReadinessPhase>,
+    /// Number of recovery attempts completed for this node.
+    #[serde(default)]
+    pub recovery_count: u32,
+    /// Current streak of consecutive liveness probe failures.
+    #[serde(default)]
+    pub consecutive_failures: u32,
+    /// Error message from the most recent liveness probe failure.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub last_liveness_error: Option<String>,
     /// Output keys whose values are sensitive (encrypted at rest, masked in display).
     #[serde(default, skip_serializing_if = "Vec::is_empty")]
     pub sensitive_keys: Vec<String>,
@@ -107,7 +122,10 @@ impl NodeState {
             port: None,
             url: None,
             outputs: HashMap::new(),
-            health_phases: Vec::new(),
+            readiness_phases: Vec::new(),
+            recovery_count: 0,
+            consecutive_failures: 0,
+            last_liveness_error: None,
             sensitive_keys: Vec::new(),
         }
     }
diff --git a/crates/veld-daemon/Cargo.toml b/crates/veld-daemon/Cargo.toml
index 209a2d3..4419129 100644
--- a/crates/veld-daemon/Cargo.toml
+++ b/crates/veld-daemon/Cargo.toml
@@ -22,4 +22,5 @@ chrono = { workspace = true }
 dirs = { workspace = true }
 libc = "0.2"
 axum = "0.8"
+reqwest = { workspace = true }
 uuid = { workspace = true }
diff --git a/crates/veld-daemon/assets/management-ui.html b/crates/veld-daemon/assets/management-ui.html
index fc2e67e..729f989 100644
--- a/crates/veld-daemon/assets/management-ui.html
+++ b/crates/veld-daemon/assets/management-ui.html
@@ -49,6 +49,7 @@
 .badge.stopped{background:rgba(85,88,112,.15);color:var(--dim)}
 .badge.starting,.badge.stopping,.badge.health_checking,.badge.pending{background:var(--yellow-bg);color:var(--yellow)}
 .badge.failed{background:var(--red-bg);color:var(--red)}
+.badge.recovering,.badge.unhealthy{background:var(--yellow-bg,#3d3800);color:var(--yellow)}
 .badge.skipped{background:rgba(85,88,112,.15);color:var(--dim)}
 
 .card-sub{display:flex;align-items:center;gap:8px;padding:0 18px 12px;font-size:12px;color:var(--text2)}
@@ -72,7 +73,11 @@
 .health-dot.healthy,.health-dot.running{background:var(--green)}
 .health-dot.starting,.health-dot.health_checking,.health-dot.pending{background:var(--yellow)}
 .health-dot.failed{background:var(--red)}
+.health-dot.unhealthy{background:var(--yellow)}
 .health-dot.stopped,.health-dot.skipped{background:var(--dim)}
+.svc-liveness{font-size:11px;color:var(--dim);padding-left:20px}
+.svc-liveness .warn{color:var(--yellow)}
+.svc-liveness .err{color:var(--red)}
 
 /* services table (merged URLs + debug) */
 .svc-table{width:100%;font-size:12px;border-collapse:collapse}
@@ -238,6 +243,15 @@
           h+='</td>';
           h+='<td class="svc-dim">'+esc(n.variant)+'</td>';
           h+='<td class="svc-pid">'+(n.pid||'')+'</td></tr>';
+          if(n.recovery_count||n.consecutive_failures||n.last_liveness_error){
+            h+='<tr><td colspan="5" class="svc-liveness">';
+            var parts=[];
+            if(n.consecutive_failures)parts.push('<span class="warn">failures: '+n.consecutive_failures+'</span>');
+            if(n.recovery_count)parts.push('recoveries: '+n.recovery_count);
+            if(n.last_liveness_error)parts.push('<span class="err">'+esc(n.last_liveness_error)+'</span>');
+            h+=parts.join(' &middot; ');
+            h+='</td></tr>';
+          }
         }
         h+='</tbody></table>';
       }else{h+='<div class="svc-empty">No services</div>';}
@@ -271,6 +285,7 @@
         '<button class="active" data-action="source-filter" data-source="all">All</button>'+
         '<button data-action="source-filter" data-source="server">Server</button>'+
         '<button data-action="source-filter" data-source="client">Client</button>'+
+        '<button data-action="source-filter" data-source="internal">Internal</button>'+
       '</div>'+
       '<input class="log-search" type="text" placeholder="Search logs..." data-action="log-search">'+
       '<span class="log-match-count"></span>'+
diff --git a/crates/veld-daemon/src/management.rs b/crates/veld-daemon/src/management.rs
index 4cb46a9..b7b7e64 100644
--- a/crates/veld-daemon/src/management.rs
+++ b/crates/veld-daemon/src/management.rs
@@ -71,6 +71,16 @@ struct NodeInfo {
     status: NodeStatus,
     url: Option<String>,
     pid: Option<u32>,
+    #[serde(skip_serializing_if = "is_zero")]
+    recovery_count: u32,
+    #[serde(skip_serializing_if = "is_zero")]
+    consecutive_failures: u32,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    last_liveness_error: Option<String>,
+}
+
+fn is_zero(v: &u32) -> bool {
+    *v == 0 // used as the `skip_serializing_if` predicate on NodeInfo's u32 counters
 }
 
 async fn list_environments() -> Result<Json<EnvironmentList>, StatusCode> {
@@ -102,6 +112,9 @@ async fn list_environments() -> Result<Json<EnvironmentList>, StatusCode> {
                                     status: ns.status.clone(),
                                     url: ns.url.clone(),
                                     pid: ns.pid,
+                                    recovery_count: ns.recovery_count,
+                                    consecutive_failures: ns.consecutive_failures,
+                                    last_liveness_error: ns.last_liveness_error.clone(),
                                 })
                                 .collect()
                         })
@@ -200,8 +213,30 @@ async fn get_logs(
     let lines_limit = q.lines.clamp(1, 5000);
     let include_server = q.source == "all" || q.source == "server";
     let include_client = q.source == "all" || q.source == "client";
+    let include_internal = q.source == "all" || q.source == "internal" || q.source == "veld";
     let mut nodes = Vec::new();
 
+    // Internal (veld daemon) log — not per-node, shown as _veld:internal.
+    if include_internal {
+        let log_path = logging::internal_log_file(&project_root, &run_name);
+        let lines = if log_path.exists() {
+            let raw = logging::tail_lines(&log_path, lines_limit)
+                .await
+                .unwrap_or_default();
+            logging::merge_continuation_lines(raw)
+        } else {
+            Vec::new()
+        };
+        if !lines.is_empty() {
+            nodes.push(NodeLogs {
+                node: "_veld".to_owned(),
+                variant: "internal".to_owned(),
+                source: "internal".to_owned(),
+                lines,
+            });
+        }
+    }
+
     for ns in run_state.nodes.values() {
         if let Some(ref filter) = q.node {
             if ns.node_name != *filter {
diff --git a/crates/veld-daemon/src/monitor.rs b/crates/veld-daemon/src/monitor.rs
index 500563a..8d0a27b 100644
--- a/crates/veld-daemon/src/monitor.rs
+++ b/crates/veld-daemon/src/monitor.rs
@@ -1,20 +1,78 @@
 use crate::broadcaster::Broadcaster;
+use std::collections::HashMap;
+use std::path::Path;
+use std::time::{Duration, Instant};
 use tracing::{debug, info, warn};
-use veld_core::state::{GlobalRegistry, ProjectState, RunStatus};
+use veld_core::config::{self, LivenessProbe, VeldConfig};
+use veld_core::logging::{self, LogWriter};
+use veld_core::state::{GlobalRegistry, NodeStatus, ProjectState, RunStatus};
 
 /// Interval between health-check scans (seconds).
 const SCAN_INTERVAL_SECS: u64 = 5;
 
+/// Tracks when each node's liveness probe was last executed.
+/// Key: `"project_root:run_name:node:variant"`.
+type LastCheckMap = HashMap<String, Instant>;
+
+/// Resolve the user's full PATH by running `$SHELL -l -i -c` (interactive login shell).
+/// Falls back to the current process PATH if the shell fails or prints no PATH line.
+fn resolve_user_path() -> String {
+    let shell = std::env::var("SHELL").unwrap_or_else(|_| "sh".to_owned());
+    // A login + interactive shell runs the user's startup files, so PATH includes
+    // entries added there (e.g. brew shellenv). The marker tags the line we want.
+    let output = std::process::Command::new(&shell)
+        .args(["-l", "-i", "-c", "echo \"__VELD_PATH__=$PATH\""])
+        .stderr(std::process::Stdio::null())
+        .output();
+
+    match output {
+        Ok(o) if o.status.success() => {
+            // Startup files may print banners to stdout, so pick out the
+            // marker-tagged line instead of trusting the whole output.
+            let stdout = String::from_utf8_lossy(&o.stdout);
+            let mut lines = stdout.lines().rev();
+            let tagged = lines.find_map(|l| l.trim().strip_prefix("__VELD_PATH__="));
+            if let Some(path) = tagged.filter(|p| !p.is_empty()) {
+                info!(path = %path, "resolved user PATH from login shell");
+                return path.to_owned();
+            }
+        }
+        Ok(o) => {
+            debug!(
+                exit_code = o.status.code(),
+                "login shell PATH resolution exited non-zero, using fallback"
+            );
+        }
+        Err(e) => {
+            debug!(error = %e, "failed to resolve user PATH, using fallback");
+        }
+    }
+
+    std::env::var("PATH").unwrap_or_default()
+}
+
 /// Periodically scan all runs from the global registry and check process health.
 /// When a status change is detected, update the registry and broadcast the event.
 pub async fn run_health_monitor(broadcaster: Broadcaster) {
     let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(SCAN_INTERVAL_SECS));
+    let mut last_checks: LastCheckMap = HashMap::new();
+
+    // Resolve the user's full PATH up front (it is refreshed in the loop below) so
+    // probe commands can find tools like pg_isready even when the daemon starts at boot.
+    let mut user_path = resolve_user_path();
+    let mut path_resolved_at = Instant::now();
 
     loop {
         interval.tick().await;
         debug!("running health-check scan");
 
-        match scan_and_update(&broadcaster).await {
+        // Re-resolve PATH every 60s to pick up changes after user login.
+        if path_resolved_at.elapsed() > Duration::from_secs(60) {
+            user_path = resolve_user_path();
+            path_resolved_at = Instant::now();
+        }
+
+        match scan_and_update(&broadcaster, &mut last_checks, &user_path).await {
             Ok(changes) => {
                 if changes > 0 {
                     info!("health scan detected {changes} status change(s)");
@@ -29,7 +87,11 @@ pub async fn run_health_monitor(broadcaster: Broadcaster) {
 
 /// Scan the global registry, check each running process, and return the number
 /// of status changes applied.
-async fn scan_and_update(broadcaster: &Broadcaster) -> anyhow::Result<usize> {
+async fn scan_and_update(
+    broadcaster: &Broadcaster,
+    last_checks: &mut LastCheckMap,
+    user_path: &str,
+) -> anyhow::Result<usize> {
     let registry = GlobalRegistry::load()?;
 
     let mut changes = 0;
@@ -120,15 +182,547 @@ async fn scan_and_update(broadcaster: &Broadcaster) -> anyhow::Result<usize> {
                 broadcaster.broadcast(&event).await;
 
                 changes += 1;
+                continue; // Skip liveness checks for a run that just stopped.
             }
+
+            // --- Liveness probe checks ---
+            // Load the project config to access probe definitions.
+            let config = match load_config_for_project(project_root) {
+                Some(c) => c,
+                None => continue,
+            };
+
+            // Create internal log writer for this run.
+            let log_path = logging::internal_log_file(project_root, run_name);
+            let internal_log = LogWriter::new(log_path).await.ok();
+
+            changes += run_liveness_checks(
+                project_root,
+                run_name,
+                &config,
+                broadcaster,
+                last_checks,
+                internal_log.as_ref(),
+                user_path,
+            )
+            .await;
         }
     }
 
     Ok(changes)
 }
 
+/// Run liveness probes for each Healthy or Unhealthy node in a run. Returns number of state changes.
+async fn run_liveness_checks(
+    project_root: &Path,
+    run_name: &str,
+    config: &VeldConfig,
+    broadcaster: &Broadcaster,
+    last_checks: &mut LastCheckMap,
+    internal_log: Option<&LogWriter>,
+    user_path: &str,
+) -> usize {
+    // Reload state from disk; if that fails or the run is gone, there is nothing to probe.
+    let mut project_state = match ProjectState::load(project_root) {
+        Ok(ps) => ps,
+        Err(_) => return 0,
+    };
+
+    let run = match project_state.get_run_mut(run_name) {
+        Some(r) => r,
+        None => return 0,
+    };
+
+    let mut changes = 0;
+
+    // Snapshot (key, node name, variant) for every Healthy or Unhealthy node.
+    // Unhealthy nodes stay in the set so they can recover once probes pass again.
+    let nodes_to_check: Vec<(String, String, String)> = run
+        .nodes
+        .iter()
+        .filter(|(_, ns)| ns.status == NodeStatus::Healthy || ns.status == NodeStatus::Unhealthy)
+        .map(|(key, ns)| (key.clone(), ns.node_name.clone(), ns.variant.clone()))
+        .collect();
+
+    for (key, node_name, variant_name) in &nodes_to_check {
+        let node_cfg = match config.nodes.get(node_name) {
+            Some(c) => c,
+            None => continue,
+        };
+        let variant_cfg = match node_cfg.variants.get(variant_name) {
+            Some(c) => c,
+            None => continue,
+        };
+
+        let liveness = match variant_cfg.liveness_probe() {
+            Some(lp) => lp,
+            None => continue,
+        };
+
+        // Respect per-probe interval_ms — skip if the last attempt was more recent than that.
+        let check_key = format!("{}:{}:{}", project_root.to_string_lossy(), run_name, key);
+        let probe_interval = Duration::from_millis(liveness.interval_ms);
+        if let Some(last) = last_checks.get(&check_key) {
+            if last.elapsed() < probe_interval {
+                continue;
+            }
+        }
+        last_checks.insert(check_key, Instant::now());
+
+        // Working directory for the probe, resolved from the node and variant `cwd` settings.
+        let working_dir = config::resolve_cwd(
+            project_root,
+            node_cfg.cwd.as_deref(),
+            variant_cfg.cwd.as_deref(),
+        );
+
+        let node_label = format!("{node_name}:{variant_name}");
+
+        if let Some(log) = internal_log {
+            let _ = log
+                .write_line(&format!(
+                    "[liveness] {node_label} — running probe (type: {})",
+                    liveness.check_type
+                ))
+                .await;
+        }
+
+        let check_result =
+            run_single_liveness_check(liveness, &working_dir, run, key, user_path).await;
+
+        let node_state = match run.nodes.get_mut(key) {
+            Some(ns) => ns,
+            None => continue,
+        };
+
+        match check_result {
+            Ok(()) => {
+                if let Some(log) = internal_log {
+                    let _ = log
+                        .write_line(&format!("[liveness] {node_label} — probe passed"))
+                        .await;
+                }
+                // Passing probe: clear any failure streak and last error; that counts as one change.
+                if node_state.consecutive_failures > 0 || node_state.status == NodeStatus::Unhealthy
+                {
+                    node_state.consecutive_failures = 0;
+                    node_state.last_liveness_error = None;
+                    // Transition Unhealthy -> Healthy (probe started passing again).
+                    if node_state.status == NodeStatus::Unhealthy {
+                        node_state.status = NodeStatus::Healthy;
+                        info!(
+                            node = node_name.as_str(),
+                            variant = variant_name.as_str(),
+                            "node self-healed — transitioning from unhealthy to healthy"
+                        );
+                        if let Some(log) = internal_log {
+                            let _ = log
+                                .write_line(&format!(
+                                    "[liveness] {node_label} — self-healed, back to healthy"
+                                ))
+                                .await;
+                        }
+                    }
+                    changes += 1;
+                }
+            }
+            Err(error_detail) => {
+                node_state.consecutive_failures += 1;
+                node_state.last_liveness_error = Some(error_detail.clone());
+                changes += 1;
+
+                info!(
+                    node = node_name.as_str(),
+                    variant = variant_name.as_str(),
+                    consecutive_failures = node_state.consecutive_failures,
+                    threshold = liveness.failure_threshold,
+                    "liveness probe failed"
+                );
+
+                if let Some(log) = internal_log {
+                    let _ = log
+                    .write_line(&format!(
+                        "[liveness] {node_label} — probe failed ({}/{} consecutive): {error_detail}",
+                        node_state.consecutive_failures, liveness.failure_threshold
+                    ))
+                    .await;
+                }
+
+                // At or past failure_threshold: either give up or attempt a recovery restart.
+                if node_state.consecutive_failures >= liveness.failure_threshold {
+                    if node_state.recovery_count >= liveness.max_recoveries {
+                        // Recovery budget (max_recoveries) exhausted — mark the node Failed.
+                        node_state.status = NodeStatus::Failed;
+                        warn!(
+                            node = node_name.as_str(),
+                            variant = variant_name.as_str(),
+                            max_recoveries = liveness.max_recoveries,
+                            "recovery exhausted — node permanently failed"
+                        );
+
+                        if let Some(log) = internal_log {
+                            let _ = log
+                                .write_line(&format!(
+                                    "[recovery] {node_label} — permanently failed after {} recovery attempts",
+                                    liveness.max_recoveries
+                                ))
+                                .await;
+                        }
+
+                        let event = serde_json::json!({
+                            "event": "recovery_exhausted",
+                            "run": run_name,
+                            "project": project_root.to_string_lossy(),
+                            "node": node_name,
+                            "variant": variant_name,
+                            "max_recoveries": liveness.max_recoveries,
+                            "timestamp": chrono::Utc::now().to_rfc3339(),
+                        });
+                        broadcaster.broadcast(&event).await;
+                    } else {
+                        // Recovery budget remains — trigger a restart of the run.
+                        let new_recovery_count = node_state.recovery_count + 1;
+
+                        info!(
+                            node = node_name.as_str(),
+                            variant = variant_name.as_str(),
+                            attempt = new_recovery_count,
+                            max = liveness.max_recoveries,
+                            "triggering recovery restart"
+                        );
+
+                        if let Some(log) = internal_log {
+                            let _ = log
+                                .write_line(&format!(
+                                    "[recovery] {node_label} — restarting environment (attempt {new_recovery_count}/{})",
+                                    liveness.max_recoveries
+                                ))
+                                .await;
+                        }
+
+                        let event = serde_json::json!({
+                            "event": "recovery_starting",
+                            "run": run_name,
+                            "project": project_root.to_string_lossy(),
+                            "node": node_name,
+                            "variant": variant_name,
+                            "attempt": new_recovery_count,
+                            "max_recoveries": liveness.max_recoveries,
+                            "timestamp": chrono::Utc::now().to_rfc3339(),
+                        });
+                        broadcaster.broadcast(&event).await;
+
+                        // Save state BEFORE the restart so the bumped recovery_count is on disk.
+                        // Status is deliberately not set to Unhealthy — the restart will create
+                        // fresh Healthy state, so only recovery_count needs to survive.
+                        node_state.recovery_count = new_recovery_count;
+                        node_state.consecutive_failures = 0;
+                        let _ = project_state.save(project_root);
+
+                        // Run the restart and wait for it. This stops+starts the entire
+                        // environment, creating fresh node state with recovery_count: 0.
+                        run_veld_restart(project_root, run_name, internal_log, user_path).await;
+
+                        // Write recovery_count back onto the freshly created state so it
+                        // accumulates across restarts and eventually hits max_recoveries.
+                        if let Ok(mut fresh_state) = ProjectState::load(project_root) {
+                            if let Some(fresh_run) = fresh_state.get_run_mut(run_name) {
+                                if let Some(fresh_node) = fresh_run.nodes.get_mut(key) {
+                                    fresh_node.recovery_count = new_recovery_count;
+                                }
+                            }
+                            let _ = fresh_state.save(project_root);
+                        }
+
+                        // Return early — saving our stale in-memory state would clobber the fresh
+                        // state created by the restart. Unprobed nodes are picked up by a later scan.
+                        return changes;
+                    }
+                }
+            }
+        }
+    }
+
+    // Persist counters/status only if something changed; a failed save is ignored.
+    if changes > 0 {
+        let _ = project_state.save(project_root);
+    }
+
+    changes
+}
+
+/// Run one liveness probe attempt for the node stored under `node_key` in `run`.
+/// Returns `Ok(())` if healthy or if there is nothing to probe, `Err(reason)` otherwise.
+async fn run_single_liveness_check(
+    liveness: &LivenessProbe,
+    working_dir: &Path,
+    run: &veld_core::state::RunState,
+    node_key: &str,
+    user_path: &str,
+) -> Result<(), String> {
+    let node_state = match run.nodes.get(node_key) {
+        Some(ns) => ns,
+        None => return Ok(()),
+    };
+
+    match liveness.check_type.as_str() {
+        "command" | "bash" => {
+            if let Some(ref cmd) = liveness.command {
+                // Bound the probe to 30s so a hung command cannot stall the monitor loop.
+                // PATH is the resolved user PATH so tools like pg_isready are found.
+                let result = tokio::time::timeout(Duration::from_secs(30), async {
+                    let mut command = tokio::process::Command::new("sh");
+                    // Reap the shell if the timeout fires; tokio leaves it running otherwise.
+                    command
+                        .kill_on_drop(true)
+                        .arg("-c")
+                        .arg(cmd)
+                        .current_dir(working_dir)
+                        .stdout(std::process::Stdio::null())
+                        .stderr(std::process::Stdio::piped())
+                        .env("PATH", user_path);
+                    // Expose node outputs as env vars (e.g. `pg_isready -h $DB_HOST`).
+                    for (key, value) in &node_state.outputs {
+                        command.env(key, value);
+                    }
+                    command.output().await
+                })
+                .await;
+
+                match result {
+                    Ok(Ok(output)) if output.status.success() => Ok(()),
+                    Ok(Ok(output)) => {
+                        let stderr = String::from_utf8_lossy(&output.stderr);
+                        let stderr = stderr.trim();
+                        let code = output.status.code().unwrap_or(-1);
+                        if stderr.is_empty() {
+                            Err(format!("exit code {code}"))
+                        } else {
+                            Err(format!("exit code {code}: {stderr}"))
+                        }
+                    }
+                    Ok(Err(e)) => Err(format!("exec error: {e}")),
+                    Err(_) => Err("command timed out (30s)".to_owned()),
+                }
+            } else {
+                Ok(()) // No command configured, consider healthy.
+            }
+        }
+        "port" => {
+            if let Some(port) = node_state.port {
+                let addr: std::net::SocketAddr = ([127, 0, 0, 1], port).into();
+                match tokio::time::timeout(
+                    Duration::from_secs(5),
+                    tokio::net::TcpStream::connect(addr),
+                )
+                .await
+                {
+                    Ok(Ok(_)) => Ok(()),
+                    Ok(Err(e)) => Err(format!("port {port} connection failed: {e}")),
+                    Err(_) => Err(format!("port {port} connection timed out")),
+                }
+            } else {
+                Ok(()) // No port known, skip.
+            }
+        }
+        "http" => {
+            if let Some(port) = node_state.port {
+                let path = liveness.path.as_deref().unwrap_or("/");
+                let path = if path.starts_with('/') {
+                    path.to_owned()
+                } else {
+                    format!("/{path}")
+                };
+                let url = format!("http://127.0.0.1:{port}{path}");
+                let expected = liveness.expect_status.unwrap_or(200);
+
+                let client = match reqwest::Client::builder()
+                    .timeout(Duration::from_secs(5))
+                    .build()
+                {
+                    Ok(c) => c,
+                    Err(e) => return Err(format!("http client error: {e}")),
+                };
+
+                match client.get(&url).send().await {
+                    Ok(resp) => {
+                        let status = resp.status().as_u16();
+                        if status == expected {
+                            Ok(())
+                        } else {
+                            Err(format!("http status {status} (expected {expected})"))
+                        }
+                    }
+                    Err(e) => Err(format!("http request failed: {e}")),
+                }
+            } else {
+                Ok(()) // No port known, skip.
+            }
+        }
+        other => {
+            warn!(
+                check_type = other,
+                "unknown liveness probe type — treating as healthy"
+            );
+            Ok(())
+        }
+    }
+}
+
+/// Parse `<project_root>/veld.json` into a [`VeldConfig`].
+///
+/// Yields `None` when the file does not exist or `config::load_config` returns an error.
+fn load_config_for_project(project_root: &Path) -> Option<VeldConfig> {
+    Some(project_root.join("veld.json"))
+        .filter(|candidate| candidate.exists())
+        .and_then(|candidate| config::load_config(&candidate).ok())
+}
+
+/// Locate the `veld` CLI binary that the daemon should execute.
+///
+/// The first candidate that exists on disk wins, probed in this order:
+/// 1. `veld` next to the currently running executable,
+/// 2. `~/.local/bin/veld`,
+/// 3. `/usr/local/bin/veld`, then `/usr/bin/veld`.
+///
+/// When none exists, the bare name `veld` is returned so that spawning it
+/// resolves through PATH.
+fn find_veld_binary() -> std::path::PathBuf {
+    use std::path::PathBuf;
+
+    // Sibling of the daemon executable, if the current exe path is known.
+    let beside_daemon = std::env::current_exe()
+        .ok()
+        .and_then(|exe| exe.parent().map(|dir| dir.join("veld")));
+
+    // Per-user install location under the home directory.
+    let user_install = dirs::home_dir().map(|home| home.join(".local/bin/veld"));
+
+    // System-wide install locations.
+    let system_installs = ["/usr/local/bin/veld", "/usr/bin/veld"].map(PathBuf::from);
+
+    beside_daemon
+        .into_iter()
+        .chain(user_install)
+        .chain(system_installs)
+        .find(|candidate| candidate.exists())
+        // Nothing found on disk: defer to PATH lookup at spawn time.
+        .unwrap_or_else(|| PathBuf::from("veld"))
+}
+
+/// Invoke `veld restart --name <run_name>` in `project_root` and wait for it to finish.
+///
+/// The child gets `user_path` as its PATH and at most 300 seconds. Nothing is returned:
+/// success, non-zero exit, spawn failure and timeout are each reported through tracing
+/// and, when `internal_log` is present, written to it with a `[recovery]` prefix.
+async fn run_veld_restart(
+    project_root: &Path,
+    run_name: &str,
+    internal_log: Option<&LogWriter>,
+    user_path: &str,
+) {
+    let veld_bin = find_veld_binary();
+
+    info!(
+        run = run_name,
+        bin = %veld_bin.display(),
+        "running veld restart"
+    );
+
+    if let Some(log) = internal_log {
+        let announcement = format!(
+            "[recovery] running: {} restart --name {}",
+            veld_bin.display(),
+            run_name
+        );
+        let _ = log.write_line(&announcement).await;
+    }
+
+    // Assemble the child process up front; both output streams are captured.
+    let mut restart_cmd = tokio::process::Command::new(&veld_bin);
+    restart_cmd
+        .args(["restart", "--name", run_name])
+        .current_dir(project_root)
+        .env("PATH", user_path)
+        .stdout(std::process::Stdio::piped())
+        .stderr(std::process::Stdio::piped());
+
+    // A full restart may take a while, so allow up to 5 minutes.
+    let budget = Duration::from_secs(300);
+    let finished = match tokio::time::timeout(budget, restart_cmd.output()).await {
+        Ok(Ok(output)) => output,
+        Ok(Err(e)) => {
+            warn!(
+                run = run_name,
+                bin = %veld_bin.display(),
+                error = %e,
+                "failed to execute veld restart"
+            );
+            if let Some(log) = internal_log {
+                let _ = log
+                    .write_line(&format!("[recovery] failed to execute veld restart: {e}"))
+                    .await;
+            }
+            return;
+        }
+        Err(_) => {
+            warn!(run = run_name, "veld restart timed out (300s)");
+            if let Some(log) = internal_log {
+                let _ = log
+                    .write_line("[recovery] veld restart timed out (300s)")
+                    .await;
+            }
+            return;
+        }
+    };
+
+    let code = finished.status.code().unwrap_or(-1);
+    let stdout = String::from_utf8_lossy(&finished.stdout);
+    let stderr = String::from_utf8_lossy(&finished.stderr);
+    // Trimming first means blank or whitespace-only output yields no lines at all.
+    let stdout_lines = stdout.trim().lines();
+    let stderr_lines = stderr.trim().lines();
+
+    // Failure path: record the exit code plus everything the CLI printed.
+    if !finished.status.success() {
+        warn!(run = run_name, exit_code = code, "veld restart failed");
+        let Some(log) = internal_log else { return };
+        let _ = log
+            .write_line(&format!(
+                "[recovery] veld restart FAILED (exit code {code})"
+            ))
+            .await;
+        for line in stdout_lines {
+            let _ = log
+                .write_line(&format!("[recovery]   stdout: {line}"))
+                .await;
+        }
+        for line in stderr_lines {
+            let _ = log
+                .write_line(&format!("[recovery]   stderr: {line}"))
+                .await;
+        }
+        return;
+    }
+
+    // Success path: only stdout is mirrored into the internal log.
+    info!(run = run_name, "veld restart completed successfully");
+    if let Some(log) = internal_log {
+        let _ = log
+            .write_line(&format!(
+                "[recovery] veld restart completed (exit code {code})"
+            ))
+            .await;
+        for line in stdout_lines {
+            let _ = log.write_line(&format!("[recovery]   {line}")).await;
+        }
+    }
+}
+
 /// Check whether a given PID is alive by sending signal 0.
 fn is_process_alive(pid: u32) -> bool {
-    // On Unix, sending signal 0 checks for process existence.
+    let Some(pid) = i32::try_from(pid).ok().filter(|&p| p > 0) else {
+        return false;
+    };
     unsafe { libc::kill(pid as libc::pid_t, 0) == 0 }
 }
diff --git a/crates/veld/src/commands/init.rs b/crates/veld/src/commands/init.rs
index a4cb865..d1e5707 100644
--- a/crates/veld/src/commands/init.rs
+++ b/crates/veld/src/commands/init.rs
@@ -9,8 +9,8 @@ use crate::output;
 // ---------------------------------------------------------------------------
 
 const INIT_TEMPLATE: &str = r#"{
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "my-project",
   "url_template": "{service}.{run}.{project}.localhost",
   "presets": {
@@ -510,7 +510,7 @@ fn generate_veld_json(
 ) -> String {
     let mut json = String::new();
     json.push_str("{\n");
-    json.push_str("  \"$schema\": \"https://veld.oss.life.li/schema/v1/veld.schema.json\",\n");
+    json.push_str("  \"$schema\": \"https://veld.oss.life.li/schema/v2/veld.schema.json\",\n");
     json.push_str("  \"schemaVersion\": \"1\",\n");
     json.push_str(&format!("  \"name\": \"{}\",\n", escape_json(project_name)));
     json.push_str(&format!(
@@ -563,7 +563,7 @@ fn generate_veld_json(
             "          \"command\": \"{}\",\n",
             escape_json(command)
         ));
-        node.push_str("          \"health_check\": { \"type\": \"port\" }");
+        node.push_str("          \"probes\": { \"readiness\": { \"type\": \"port\" } }");
 
         // Add depends_on if any
         let service_deps: Vec<&String> = deps
@@ -853,8 +853,8 @@ pub async fn run() -> i32 {
         // No services detected/selected: write basic template with project name
         format!(
             r#"{{
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "{}",
   "url_template": "{}",
   "presets": {{
diff --git a/crates/veld/src/commands/logs.rs b/crates/veld/src/commands/logs.rs
index e27e6ab..643f95f 100644
--- a/crates/veld/src/commands/logs.rs
+++ b/crates/veld/src/commands/logs.rs
@@ -15,6 +15,7 @@ pub enum SourceFilter {
     All,
     Server,
     Client,
+    Internal,
 }
 
 impl SourceFilter {
@@ -23,6 +24,7 @@ impl SourceFilter {
             "all" => Some(Self::All),
             "server" => Some(Self::Server),
             "client" => Some(Self::Client),
+            "internal" | "veld" => Some(Self::Internal),
             _ => None,
         }
     }
@@ -106,22 +108,31 @@ pub async fn run(opts: LogsOptions) -> i32 {
 
     // Build list of (path, node, variant, source_label) for each log file to read.
     let mut log_sources: Vec<(PathBuf, &str, &str, &str)> = Vec::new();
-    for (node_name, variant) in &targets {
-        if source != SourceFilter::Client {
-            log_sources.push((
-                logging::log_file(&project_root, run_name, node_name, variant),
-                node_name,
-                variant,
-                "server",
-            ));
-        }
-        if source != SourceFilter::Server {
-            log_sources.push((
-                logging::client_log_file(&project_root, run_name, node_name, variant),
-                node_name,
-                variant,
-                "client",
-            ));
+
+    // Internal (veld daemon) log — not per-node, shown when source is "all" or "internal".
+    if source == SourceFilter::All || source == SourceFilter::Internal {
+        let internal_path = logging::internal_log_file(&project_root, run_name);
+        log_sources.push((internal_path, "_veld", "internal", "internal"));
+    }
+
+    if source != SourceFilter::Internal {
+        for (node_name, variant) in &targets {
+            if source != SourceFilter::Client {
+                log_sources.push((
+                    logging::log_file(&project_root, run_name, node_name, variant),
+                    node_name,
+                    variant,
+                    "server",
+                ));
+            }
+            if source != SourceFilter::Server {
+                log_sources.push((
+                    logging::client_log_file(&project_root, run_name, node_name, variant),
+                    node_name,
+                    variant,
+                    "client",
+                ));
+            }
         }
     }
 
@@ -299,22 +310,35 @@ async fn follow_logs(
 
     // Build list of (path, node, variant, source_label) to follow.
     let mut follow_sources: Vec<(PathBuf, String, String, String)> = Vec::new();
-    for (node_name, variant) in targets {
-        if source != SourceFilter::Client {
-            follow_sources.push((
-                logging::log_file(project_root, run_name, node_name, variant),
-                node_name.to_string(),
-                variant.to_string(),
-                "server".to_string(),
-            ));
-        }
-        if source != SourceFilter::Server {
-            follow_sources.push((
-                logging::client_log_file(project_root, run_name, node_name, variant),
-                node_name.to_string(),
-                variant.to_string(),
-                "client".to_string(),
-            ));
+
+    // Internal log.
+    if source == SourceFilter::All || source == SourceFilter::Internal {
+        follow_sources.push((
+            logging::internal_log_file(project_root, run_name),
+            "_veld".to_string(),
+            "internal".to_string(),
+            "internal".to_string(),
+        ));
+    }
+
+    if source != SourceFilter::Internal {
+        for (node_name, variant) in targets {
+            if source != SourceFilter::Client {
+                follow_sources.push((
+                    logging::log_file(project_root, run_name, node_name, variant),
+                    node_name.to_string(),
+                    variant.to_string(),
+                    "server".to_string(),
+                ));
+            }
+            if source != SourceFilter::Server {
+                follow_sources.push((
+                    logging::client_log_file(project_root, run_name, node_name, variant),
+                    node_name.to_string(),
+                    variant.to_string(),
+                    "client".to_string(),
+                ));
+            }
         }
     }
 
diff --git a/crates/veld/src/commands/start.rs b/crates/veld/src/commands/start.rs
index 8728ae7..65c4142 100644
--- a/crates/veld/src/commands/start.rs
+++ b/crates/veld/src/commands/start.rs
@@ -420,7 +420,7 @@ fn render_progress_tty(event: &ProgressEvent, ctx: &mut TtyProgressCtx) {
                 state.redraw(ctx.total, "starting...");
             }
         }
-        ProgressEvent::HealthCheckPhase {
+        ProgressEvent::ReadinessProbePhase {
             node,
             variant,
             phase,
@@ -433,7 +433,7 @@ fn render_progress_tty(event: &ProgressEvent, ctx: &mut TtyProgressCtx) {
                 state.redraw(ctx.total, "");
             }
         }
-        ProgressEvent::HealthCheckAttempt {
+        ProgressEvent::ReadinessProbeAttempt {
             node,
             variant,
             phase: _,
@@ -444,7 +444,7 @@ fn render_progress_tty(event: &ProgressEvent, ctx: &mut TtyProgressCtx) {
                 state.redraw(ctx.total, &format!("attempt {attempt}"));
             }
         }
-        ProgressEvent::HealthCheckPassed {
+        ProgressEvent::ReadinessProbePassed {
             node: _,
             variant: _,
             phase: _,
@@ -479,7 +479,7 @@ fn render_progress_tty(event: &ProgressEvent, ctx: &mut TtyProgressCtx) {
                 "  {} {} {}",
                 output::dim("~"),
                 output::pad_right(&key, 30),
-                output::dim("skipped (verify passed)"),
+                output::dim("skipped (skip_if passed)"),
             );
             if let Some(state) = ctx.bars.remove(&key) {
                 state.bar.finish_with_message(finish_msg);
diff --git a/crates/veld/src/commands/status.rs b/crates/veld/src/commands/status.rs
index 3a4548f..0f9a1f5 100644
--- a/crates/veld/src/commands/status.rs
+++ b/crates/veld/src/commands/status.rs
@@ -105,6 +105,46 @@ pub async fn run(name: Option<String>, show_outputs: bool, json: bool) -> i32 {
 
         output::print_table(&["NODE", "VARIANT", "STATUS", "URL"], &rows);
 
+        // Show liveness/recovery details for nodes that have them.
+        let has_liveness_info = run_state.nodes.values().any(|ns| {
+            ns.recovery_count > 0
+                || ns.consecutive_failures > 0
+                || ns.last_liveness_error.is_some()
+                || ns.status == NodeStatus::Unhealthy
+        });
+        if has_liveness_info {
+            println!();
+            println!("{}", output::bold("Liveness:"));
+            for key in &node_keys {
+                let ns = &run_state.nodes[*key];
+                if ns.recovery_count == 0
+                    && ns.consecutive_failures == 0
+                    && ns.last_liveness_error.is_none()
+                    && ns.status != NodeStatus::Unhealthy
+                {
+                    continue;
+                }
+                println!();
+                println!(
+                    "  {}",
+                    output::cyan(&format!("{}:{}", ns.node_name, ns.variant))
+                );
+                if ns.consecutive_failures > 0 {
+                    println!(
+                        "    {} consecutive failures: {}",
+                        output::yellow("!"),
+                        ns.consecutive_failures
+                    );
+                }
+                if ns.recovery_count > 0 {
+                    println!("    {} recoveries: {}", output::dim("↻"), ns.recovery_count);
+                }
+                if let Some(ref err) = ns.last_liveness_error {
+                    println!("    {} last error: {}", output::dim("→"), err);
+                }
+            }
+        }
+
         // Show outputs per node when --outputs is passed.
         if show_outputs {
             println!();
@@ -174,6 +214,7 @@ fn format_run_status(status: &RunStatus) -> String {
         RunStatus::Stopping => output::yellow("stopping"),
         RunStatus::Stopped => output::dim("stopped"),
         RunStatus::Failed => output::red("failed"),
+        RunStatus::Recovering => output::yellow("recovering"),
     }
 }
 
@@ -192,5 +233,6 @@ fn format_node_status(status: &NodeStatus) -> String {
         NodeStatus::Stopped => format!("{} {}", output::dim("-"), output::dim("stopped")),
         NodeStatus::Failed => format!("{} {}", output::cross(), output::red("failed")),
         NodeStatus::Skipped => format!("{} {}", output::dim("-"), output::dim("skipped")),
+        NodeStatus::Unhealthy => format!("{} {}", output::cross(), output::yellow("unhealthy")),
     }
 }
diff --git a/crates/veld/src/commands/update.rs b/crates/veld/src/commands/update.rs
index 6fb0b4c..1ecebf5 100644
--- a/crates/veld/src/commands/update.rs
+++ b/crates/veld/src/commands/update.rs
@@ -1,3 +1,9 @@
+use std::io::Write;
+
+use veld_core::config;
+use veld_core::orchestrator::Orchestrator;
+use veld_core::state::{GlobalRegistry, ProjectState, RunStatus};
+
 use crate::output;
 
 /// `veld update` -- update Veld to the latest version.
@@ -8,6 +14,42 @@ pub async fn run() -> i32 {
 
     match veld_core::setup::check_update().await {
         Ok(Some(new_version)) => {
+            // Check for running environments and stop them before updating.
+            let running = find_running_environments();
+            if !running.is_empty() {
+                println!();
+                output::print_info(&format!(
+                    "Found {} running environment(s) that must be stopped before updating:",
+                    running.len()
+                ));
+                for (project, run_name) in &running {
+                    println!(
+                        "  {} {}",
+                        output::cyan(run_name),
+                        output::dim(&format!("({})", project.display()))
+                    );
+                }
+                println!();
+                print!(
+                    "{}",
+                    output::yellow("Stop all environments and proceed with update? [y/N] ")
+                );
+                let _ = std::io::stdout().flush();
+
+                let mut answer = String::new();
+                if std::io::stdin().read_line(&mut answer).is_err()
+                    || !answer.trim().eq_ignore_ascii_case("y")
+                {
+                    output::print_info("Update cancelled.");
+                    return 0;
+                }
+
+                // Stop all running environments.
+                let stopped = stop_all_environments(&running).await;
+                output::print_success(&format!("Stopped {stopped} environment(s)."));
+                println!();
+            }
+
             output::print_info(&format!("New version available: {current} → {new_version}"));
             output::print_info("Installing update...");
 
@@ -37,6 +79,65 @@ pub async fn run() -> i32 {
     }
 }
 
+/// Find all active environments (running or recovering) across all projects.
+/// Returns (project_root, run_name) pairs; empty if the registry fails to load.
+fn find_running_environments() -> Vec<(std::path::PathBuf, String)> {
+    let registry = match GlobalRegistry::load() {
+        Ok(r) => r,
+        Err(_) => return Vec::new(),
+    };
+    let mut running = Vec::new();
+    for entry in registry.projects.values() {
+        for (run_name, run_info) in &entry.runs {
+            // Presumably `Recovering` runs are being auto-restarted; stop them too.
+            if matches!(run_info.status, RunStatus::Running | RunStatus::Recovering) {
+                running.push((entry.project_root.clone(), run_name.clone()));
+            }
+        }
+    }
+    running
+}
+
+/// Stops every environment in `envs`, printing a result line for each run.
+///
+/// Returns the number of runs the orchestrator stopped successfully; a run
+/// whose config fails to load is not counted.
+async fn stop_all_environments(envs: &[(std::path::PathBuf, String)]) -> usize {
+    let mut success_count = 0;
+    for (project_root, run_name) in envs {
+        let config_path = project_root.join("veld.json");
+        let loaded = match config::load_config(&config_path) {
+            Ok(loaded) => loaded,
+            Err(e) => {
+                // Without a config no orchestrator can be built here, so report
+                // the error and fall back to removing the run from saved state.
+                let root = project_root.display();
+                output::print_error(&format!("Failed to load config for {}: {e}", root), false);
+                cleanup_state(project_root, run_name);
+                continue;
+            }
+        };
+
+        let mut orch = Orchestrator::new(config_path, loaded);
+        let outcome = orch.stop(run_name).await;
+        if let Err(e) = outcome {
+            output::print_error(&format!("  Failed to stop '{run_name}': {e}"), false);
+            continue;
+        }
+        output::print_info(&format!("  Stopped '{run_name}'"));
+        success_count += 1;
+    }
+    success_count
+}
+
+/// Best-effort: forget `run_name` in the project's saved state, ignoring errors.
+fn cleanup_state(project_root: &std::path::Path, run_name: &str) {
+    let _ = ProjectState::load(project_root).map(|mut saved| {
+        saved.runs.remove(run_name);
+        saved.save(project_root)
+    });
+}
+
 /// Re-install the Hammerspoon Spoon if it was previously set up.
 /// The Spoon files are embedded in the binary, so they need to be re-extracted
 /// after every CLI update to pick up any changes.
diff --git a/crates/veld/src/main.rs b/crates/veld/src/main.rs
index 3058786..07ff954 100644
--- a/crates/veld/src/main.rs
+++ b/crates/veld/src/main.rs
@@ -156,7 +156,7 @@ enum Command {
         #[arg(long)]
         json: bool,
 
-        /// Filter by log source: all, server, or client.
+        /// Filter by log source: all, server, client, or internal (veld daemon liveness/recovery logs).
         #[arg(long, default_value = "all")]
         source: String,
 
@@ -351,7 +351,9 @@ async fn main() {
             let source_filter =
                 commands::logs::SourceFilter::from_str(&source).unwrap_or_else(|| {
                     output::print_error(
-                        &format!("Invalid --source value '{source}'. Use: all, server, client"),
+                        &format!(
+                            "Invalid --source value '{source}'. Use: all, server, client, internal"
+                        ),
                         json,
                     );
                     std::process::exit(1);
diff --git a/docs/configuration.md b/docs/configuration.md
index 05f5316..5b3b989 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -12,8 +12,8 @@ All relative paths in the configuration resolve relative to the directory contai
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "my-app",
   "nodes": {
     "backend": {
@@ -34,7 +34,7 @@ All relative paths in the configuration resolve relative to the directory contai
 | Field            | Type   | Required | Description                                      |
 |------------------|--------|----------|--------------------------------------------------|
 | `$schema`        | string | No       | JSON Schema URL for editor autocompletion         |
-| `schemaVersion`  | string | Yes      | Must be `"1"` for the current version             |
+| `schemaVersion`  | string | Yes      | `"1"` or `"2"`. Use `"2"` for new projects.       |
 | `name`           | string | Yes      | Human-readable project name                       |
 | `url_template`      | string | No       | URL template for services (see [URL Templates])   |
 | `presets`           | object | No       | Named shortcuts for node:variant selections       |
@@ -67,10 +67,10 @@ The name is available as the `{project}` variable in URL templates and as `${vel
 
 ### `schemaVersion`
 
-Must be `"1"`. Veld validates this on every command and exits with a clear error if it encounters an unknown version.
+Must be `"1"` or `"2"`. Use `"2"` for new projects — it enables `probes` and `skip_if`. Version `"1"` is still fully supported (uses legacy `health_check` and `verify` fields).
 
 ```json
-"schemaVersion": "1"
+"schemaVersion": "2"
 ```
 
 ### `url_template`
@@ -322,14 +322,15 @@ A variant defines how a node behaves in a given context. The same node might be
 | `type`              | string           | Yes      | All            | `"command"` or `"start_server"`                          |
 | `command`           | string           | Varies   | All            | Inline shell command to execute                       |
 | `script`            | string           | Varies   | `command` only    | Path to script file, relative to `veld.json`          |
-| `health_check`      | object           | Required for `start_server` | `start_server` | How to verify the service is healthy |
+| `health_check`      | object           | No       | `start_server` | Legacy readiness probe. Deprecated: use `probes.readiness` |
+| `probes`            | object           | No       | All            | Readiness and liveness probe configuration            |
 | `depends_on`        | object           | No       | All            | Dependencies on other nodes                           |
 | `env`               | object           | No       | All            | Extra environment variables                           |
 | `outputs`           | array or object  | No       | All            | Output declarations (format varies by type)           |
 | `sensitive_outputs`  | array of strings | No       | All            | Output keys to mask and encrypt                       |
 | `url_template`      | string           | No       | `start_server` | URL template override for this variant                |
 | `on_stop`           | string           | No       | All            | Teardown command run when the environment is stopped  |
-| `verify`            | string           | No       | `command` only    | Idempotency verification command                      |
+| `skip_if`           | string           | No       | `command` only    | Idempotency check — skip if exits 0 (alias: `verify`)|
 | `client_log_levels` | array of strings | No       | `start_server` | Browser log levels override for this variant          |
 | `features`          | object           | No       | `start_server` | Feature toggles override for this variant             |
 
@@ -343,7 +344,7 @@ Runs a shell command or script to completion. Used for setup tasks such as datab
 - Must specify either `command` or `script` (mutually exclusive)
 - Can declare outputs by writing `key=value` lines to `$VELD_OUTPUT_FILE` (preferred) or via `VELD_OUTPUT key=value` on stdout (legacy, discouraged — exposes values in terminal/logs)
 - Built-in output: `exit_code`
-- Supports the `verify` field for idempotency
+- Supports the `skip_if` field for idempotency
 
 ```json
 {
@@ -363,7 +364,7 @@ Starts and manages a long-lived process. Veld allocates a port, injects it as `$
 - Built-in outputs: `url` (the full HTTPS URL) and `port` (the allocated port number)
 - Built-in variables: `${veld.port}` and `${veld.url}` are available in this node's `command`, `env`, and `outputs` templates
 - Ports and URLs are **pre-computed** before any node executes, so `${nodes.X.url}` and `${nodes.X.port}` for any `start_server` node are available everywhere -- no dependency edge required
-- Supports the `health_check` field (required)
+- Requires a readiness probe: use `probes.readiness` (preferred) or the legacy `health_check` field
 - Users never see or deal with port numbers -- only clean HTTPS URLs
 
 ```json
@@ -451,6 +452,51 @@ Runs a shell command and checks the exit code. Exit code `0` means healthy.
 }
 ```
 
+### `probes`
+
+Configures readiness and liveness probes for a variant. Available for both `command` and `start_server` types. `probes.readiness` supersedes the legacy `health_check` field.
+
+```json
+"probes": {
+  "readiness": {
+    "type": "http",
+    "path": "/health",
+    "timeout_seconds": 30
+  },
+  "liveness": {
+    "type": "command",
+    "command": "pg_isready -h localhost -p 5432",
+    "interval_ms": 5000,
+    "failure_threshold": 3,
+    "max_recoveries": 3
+  }
+}
+```
+
+#### Readiness Probe
+
+Gates the dependency graph during startup. Same fields as `health_check`. For `start_server` nodes, runs after the process starts. For `command` nodes, runs after the command exits 0.
+
+#### Liveness Probe
+
+Runs continuously after the node becomes healthy. Detects failures like dropped SSH tunnels, crashed background processes, or unreachable databases. Supports the same three check types as readiness probes:
+
+- **`http`**: Polls an HTTP endpoint. Passes when the expected status code is returned.
+- **`port`**: Checks if a TCP port is accepting connections.
+- **`command`**: Runs an arbitrary shell command (via `sh -c`). Exit code `0` means healthy, non-zero means unhealthy. Pipes, redirects, and `&&` chains all work. The node's outputs are injected as environment variables, so you can reference them directly (e.g., `pg_isready -h $DB_HOST -p $DB_PORT`).
+
+| Field               | Type    | Required | Description                                                  |
+|---------------------|---------|----------|--------------------------------------------------------------|
+| `type`              | string  | Yes      | Strategy: `"http"`, `"port"`, or `"command"`                 |
+| `path`              | string  | No       | HTTP path to poll (`http` type only)                         |
+| `expect_status`     | integer | No       | Expected HTTP status code (`http` type only, default: 200)   |
+| `command`           | string  | No       | Shell command to run (`command` type only)                    |
+| `interval_ms`       | integer | No       | Milliseconds between checks (default: 5000, min: 1000)      |
+| `failure_threshold` | integer | No       | Consecutive failures before triggering recovery (default: 3) |
+| `max_recoveries`    | integer | No       | Max recovery attempts before permanent failure (default: 3)  |
+
+When `failure_threshold` consecutive liveness checks fail, Veld automatically restarts the entire environment (equivalent to `veld restart`). If the restart succeeds and the probe starts passing, the node returns to healthy. If `max_recoveries` restart attempts are exhausted, the node is marked as permanently failed and no further restarts are attempted. You can see recovery status via `veld status` and `veld logs --source internal`.
+
 ### `depends_on`
 
 Declares dependencies as explicit `node:variant` pairs. Dependencies are resolved before this variant starts. The value is an object mapping node names to variant names.
@@ -554,21 +600,21 @@ An array of output key names whose values are sensitive. These outputs are:
 }
 ```
 
-### `verify`
+### `skip_if`
 
-An idempotency verification command. Only applies to `command` type variants. Before running the main command/script, Veld executes the verify command:
+An idempotency check command (previously named `verify`, which is still accepted as an alias). Only applies to `command` type variants. Before running the main command/script, Veld executes the `skip_if` command:
 
 - **Exit code 0:** The step is considered already complete and is skipped.
 - **Non-zero exit code:** The step runs normally.
-- If `verify` itself errors unexpectedly, the step re-runs (safe default).
+- If `skip_if` itself errors unexpectedly, the step re-runs (safe default).
 
-The verify command receives the previous run's output variables as environment variables, so it can check whether the previous result is still valid.
+The `skip_if` command receives the previous run's output variables as environment variables, so it can check whether the previous result is still valid.
 
 ```json
 {
   "type": "command",
   "script": "./scripts/clone-db.sh",
-  "verify": "./scripts/verify-db.sh",
+  "skip_if": "./scripts/verify-db.sh",
   "outputs": ["DATABASE_URL"]
 }
 ```
@@ -851,8 +897,8 @@ Below is a realistic `veld.json` for a monorepo with a database, backend API, fr
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "my-project",
   "url_template": "{service}.{branch ?? run}.my-project.localhost",
 
@@ -868,7 +914,7 @@ Below is a realistic `veld.json` for a monorepo with a database, backend API, fr
         "local": {
           "type": "command",
           "script": "./scripts/clone-db.sh",
-          "verify": "./scripts/verify-db.sh",
+          "skip_if": "./scripts/verify-db.sh",
           "on_stop": "./scripts/drop-db.sh",
           "outputs": ["DATABASE_URL"],
           "sensitive_outputs": ["DATABASE_URL"]
@@ -894,7 +940,7 @@ Below is a realistic `veld.json` for a monorepo with a database, backend API, fr
         "default": {
           "type": "command",
           "command": "./scripts/generate-dev-certs.sh",
-          "verify": "test -f ./certs/dev.pem"
+          "skip_if": "test -f ./certs/dev.pem"
         }
       }
     },
@@ -1055,7 +1101,7 @@ Veld provides a JSON Schema for editor autocompletion and validation. Add the `$
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
   ...
 }
 ```
diff --git a/docs/scenarios.md b/docs/scenarios.md
index b7cc787..72bef02 100644
--- a/docs/scenarios.md
+++ b/docs/scenarios.md
@@ -35,8 +35,8 @@ For the full field reference, see [configuration.md](./configuration.md).
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "shopfront",
   "url_template": "{service}.{branch ?? run}.shopfront.localhost",
 
@@ -47,7 +47,7 @@ For the full field reference, see [configuration.md](./configuration.md).
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=shopfront -p ${veld.port}:5432 postgres:16",
           "on_stop": "docker stop veld-pg-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 30 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/shopfront"
           }
@@ -60,7 +60,7 @@ For the full field reference, see [configuration.md](./configuration.md).
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @shopfront/api dev --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": { "database": "docker" },
           "env": {
             "DATABASE_URL": "${nodes.database.DATABASE_URL}",
@@ -75,7 +75,7 @@ For the full field reference, see [configuration.md](./configuration.md).
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @shopfront/web dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "depends_on": { "backend": "local" },
           "env": {
             "PORT": "${veld.port}",
@@ -98,8 +98,8 @@ For the full field reference, see [configuration.md](./configuration.md).
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "portal",
   "url_template": "{service}.{branch ?? run}.portal.localhost",
 
@@ -109,7 +109,7 @@ For the full field reference, see [configuration.md](./configuration.md).
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @portal/api dev --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "env": {
             "CORS_ORIGIN": "${nodes.frontend.url}",
             "ALLOWED_ORIGINS": "${nodes.frontend.url},${nodes.admin.url}"
@@ -123,7 +123,7 @@ For the full field reference, see [configuration.md](./configuration.md).
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @portal/web dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "env": {
             "PORT": "${veld.port}",
             "NEXT_PUBLIC_API_URL": "${nodes.backend.url}"
@@ -137,7 +137,7 @@ For the full field reference, see [configuration.md](./configuration.md).
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @portal/admin dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "env": {
             "PORT": "${veld.port}",
             "NEXT_PUBLIC_API_URL": "${nodes.backend.url}"
@@ -161,8 +161,8 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "taskboard",
   "url_template": "{service}.{branch ?? run}.taskboard.localhost",
 
@@ -173,12 +173,12 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=taskboard -p ${veld.port}:5432 postgres:16",
           "on_stop": "docker stop veld-pg-${veld.run}",
-          "health_check": {
+          "probes": { "readiness": {
             "type": "command",
             "command": "docker exec veld-pg-${veld.run} pg_isready -U postgres",
             "timeout_seconds": 30,
             "interval_ms": 2000
-          },
+          } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/taskboard"
           }
@@ -192,11 +192,11 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs
           "type": "start_server",
           "command": "docker run --rm --name veld-redis-${veld.run} -p ${veld.port}:6379 redis:7-alpine",
           "on_stop": "docker stop veld-redis-${veld.run}",
-          "health_check": {
+          "probes": { "readiness": {
             "type": "command",
             "command": "docker exec veld-redis-${veld.run} redis-cli ping",
             "timeout_seconds": 15
-          },
+          } },
           "outputs": {
             "REDIS_URL": "redis://localhost:${veld.port}/0"
           }
@@ -209,7 +209,7 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs
         "local": {
           "type": "start_server",
           "command": "cargo run --bin taskboard-api -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 },
+          "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } },
           "depends_on": {
             "postgres": "docker",
             "redis": "docker"
@@ -235,8 +235,8 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "dashboard",
   "url_template": "{service}.{branch ?? run}.dashboard.localhost",
 
@@ -252,7 +252,7 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=dashboard -p ${veld.port}:5432 postgres:16",
           "on_stop": "docker stop veld-pg-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 30 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/dashboard"
           }
@@ -266,7 +266,7 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @dashboard/api dev --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": { "database": "docker" },
           "env": {
             "DATABASE_URL": "${nodes.database.DATABASE_URL}"
@@ -286,7 +286,7 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @dashboard/web dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "depends_on": { "backend": "local" },
           "env": {
             "PORT": "${veld.port}",
@@ -296,7 +296,7 @@ Note: This pre-computation only applies to the built-in `url` and `port` outputs
         "staging": {
           "type": "start_server",
           "command": "pnpm --filter @dashboard/web dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "depends_on": { "backend": "staging" },
           "env": {
             "PORT": "${veld.port}",
@@ -324,8 +324,8 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "analytics",
   "url_template": "{service}.{branch ?? run}.analytics.localhost",
 
@@ -336,11 +336,11 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -p ${veld.port}:5432 -v veld-pg-data-${veld.run}:/var/lib/postgresql/data postgres:16",
           "on_stop": "docker stop veld-pg-${veld.run}",
-          "health_check": {
+          "probes": { "readiness": {
             "type": "command",
             "command": "docker exec veld-pg-${veld.run} pg_isready -U postgres",
             "timeout_seconds": 30
-          },
+          } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/analytics"
           }
@@ -354,7 +354,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
         "default": {
           "type": "command",
           "command": "pg_dump $SOURCE_DB_URL | psql ${nodes.postgres.DATABASE_URL}",
-          "verify": "psql ${nodes.postgres.DATABASE_URL} -c 'SELECT 1 FROM users LIMIT 1'",
+          "skip_if": "psql ${nodes.postgres.DATABASE_URL} -c 'SELECT 1 FROM users LIMIT 1'",
           "depends_on": { "postgres": "docker" },
           "env": {
             "SOURCE_DB_URL": "postgresql://readonly:secret@staging.analytics.example.com:5432/analytics"
@@ -370,7 +370,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @analytics/api dev --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": { "clone-db": "default" },
           "env": {
             "DATABASE_URL": "${nodes.postgres.DATABASE_URL}"
@@ -382,7 +382,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
 }
 ```
 
-**What happens:** The `clone-db` node depends on `postgres:docker`. Before running the expensive `pg_dump | psql` pipeline, Veld executes the `verify` command. If the `users` table already has data (`SELECT 1` succeeds), the clone is skipped entirely. On the first run, the clone executes. On subsequent runs, it is a no-op. The `hidden: true` flag keeps `clone-db` out of `veld nodes` output since it is an internal concern.
+**What happens:** The `clone-db` node depends on `postgres:docker`. Before running the expensive `pg_dump | psql` pipeline, Veld executes the `skip_if` command. If the `users` table already has data (`SELECT 1` succeeds), the clone is skipped entirely. On the first run, the clone executes. On subsequent runs, it is a no-op. The `hidden: true` flag keeps `clone-db` out of `veld nodes` output since it is an internal concern.
 
 ---
 
@@ -392,8 +392,8 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "rideshare",
   "url_template": "{service}.{branch ?? run}.rideshare.localhost",
 
@@ -410,7 +410,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -p ${veld.port}:5432 postgres:16",
           "on_stop": "docker stop veld-pg-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 30 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/rideshare"
           }
@@ -424,7 +424,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
           "type": "start_server",
           "command": "docker run --rm --name veld-redis-${veld.run} -p ${veld.port}:6379 redis:7-alpine",
           "on_stop": "docker stop veld-redis-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 15 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 15 } },
           "outputs": {
             "REDIS_URL": "redis://localhost:${veld.port}/0"
           }
@@ -437,7 +437,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
         "local": {
           "type": "start_server",
           "command": "cargo run --bin rider-service -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 },
+          "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } },
           "depends_on": { "postgres": "docker", "redis": "docker" },
           "env": {
             "DATABASE_URL": "${nodes.postgres.DATABASE_URL}",
@@ -453,7 +453,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
         "local": {
           "type": "start_server",
           "command": "cargo run --bin driver-service -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 },
+          "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } },
           "depends_on": { "postgres": "docker", "redis": "docker" },
           "env": {
             "DATABASE_URL": "${nodes.postgres.DATABASE_URL}",
@@ -468,7 +468,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
         "local": {
           "type": "start_server",
           "command": "cargo run --bin pricing-service -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 },
+          "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } },
           "depends_on": { "redis": "docker" },
           "env": {
             "REDIS_URL": "${nodes.redis.REDIS_URL}"
@@ -482,7 +482,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
         "local": {
           "type": "start_server",
           "command": "cargo run --bin notification-service -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 },
+          "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } },
           "depends_on": { "redis": "docker" },
           "env": {
             "REDIS_URL": "${nodes.redis.REDIS_URL}"
@@ -496,7 +496,7 @@ Note the qualified form `${nodes.backend:local.url}` and `${nodes.backend:stagin
         "local": {
           "type": "start_server",
           "command": "cargo run --bin gateway -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": {
             "rider-service": "local",
             "driver-service": "local",
@@ -535,8 +535,8 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "enterprise-app",
   "url_template": "{service}.{branch ?? run}.enterprise-app.localhost",
 
@@ -547,11 +547,11 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=enterprise -p ${veld.port}:5432 postgres:16",
           "on_stop": "docker stop veld-pg-${veld.run}",
-          "health_check": {
+          "probes": { "readiness": {
             "type": "command",
             "command": "docker exec veld-pg-${veld.run} pg_isready -U postgres",
             "timeout_seconds": 30
-          },
+          } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/enterprise"
           }
@@ -565,7 +565,7 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se
         "default": {
           "type": "command",
           "command": "./scripts/generate-dev-certs.sh",
-          "verify": "test -f ./certs/dev.pem && test -f ./certs/dev-key.pem",
+          "skip_if": "test -f ./certs/dev.pem && test -f ./certs/dev-key.pem",
           "outputs": ["CERT_PATH", "KEY_PATH"]
         }
       }
@@ -581,7 +581,7 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se
           "env": {
             "DATABASE_URL": "${nodes.postgres.DATABASE_URL}"
           },
-          "verify": "pnpm --filter @enterprise/db migrate:status --exit-code"
+          "skip_if": "pnpm --filter @enterprise/db migrate:status --exit-code"
         }
       }
     },
@@ -596,7 +596,7 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se
           "env": {
             "DATABASE_URL": "${nodes.postgres.DATABASE_URL}"
           },
-          "verify": "pnpm --filter @enterprise/db seed:check"
+          "skip_if": "pnpm --filter @enterprise/db seed:check"
         }
       }
     },
@@ -606,7 +606,7 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @enterprise/api dev --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": {
             "seed-db": "default",
             "generate-certs": "default"
@@ -625,7 +625,7 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @enterprise/web dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "depends_on": { "backend": "local" },
           "env": {
             "PORT": "${veld.port}",
@@ -641,12 +641,12 @@ The presets let you run subsets: `--preset riders-only` starts only the rider se
 **What happens:**
 
 1. `postgres:docker` and `generate-certs:default` start in parallel (independent).
-2. Once Postgres is ready, `migrate-db` runs (skipped if migrations are current, thanks to `verify`).
+2. Once Postgres is ready, `migrate-db` runs (skipped if migrations are current, thanks to `skip_if`).
 3. Once migrations complete, `seed-db` runs (skipped if seed data exists).
 4. Once both `seed-db` and `generate-certs` finish, `backend:local` starts.
 5. Finally, `frontend:local` starts.
 
-The `verify` commands on the setup nodes make subsequent `veld start` calls fast -- if certs exist, migrations are current, and seed data is present, all three setup steps are skipped in milliseconds.
+The `skip_if` commands on the setup nodes make subsequent `veld start` calls fast -- if certs exist, migrations are current, and seed data is present, all three setup steps are skipped in milliseconds.
 
 ---
 
@@ -656,8 +656,8 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "search-platform",
   "url_template": "{service}.{branch ?? run}.search-platform.localhost",
 
@@ -668,7 +668,7 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=search_platform -p ${veld.port}:5432 postgres:16",
           "on_stop": "docker stop veld-pg-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 30 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/search_platform",
             "JDBC_URL": "jdbc:postgresql://localhost:${veld.port}/search_platform"
@@ -683,7 +683,7 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast
           "type": "start_server",
           "command": "docker run --rm --name veld-redis-${veld.run} -p ${veld.port}:6379 redis:7-alpine --appendonly yes",
           "on_stop": "docker stop veld-redis-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 15 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 15 } },
           "outputs": {
             "REDIS_URL": "redis://localhost:${veld.port}/0",
             "REDIS_CACHE_URL": "redis://localhost:${veld.port}/1",
@@ -699,12 +699,12 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast
           "type": "start_server",
           "command": "docker run --rm --name veld-es-${veld.run} -e discovery.type=single-node -e xpack.security.enabled=false -e ES_JAVA_OPTS='-Xms512m -Xmx512m' -p ${veld.port}:9200 elasticsearch:8.13.0",
           "on_stop": "docker stop veld-es-${veld.run}",
-          "health_check": {
+          "probes": { "readiness": {
             "type": "http",
             "path": "/_cluster/health",
             "timeout_seconds": 90,
             "interval_ms": 3000
-          },
+          } },
           "outputs": {
             "ELASTICSEARCH_URL": "http://localhost:${veld.port}"
           }
@@ -717,12 +717,12 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @search-platform/api dev --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": {
             "postgres": "docker",
             "redis": "docker",
             "elasticsearch": "docker"
-          },
+          },
           "env": {
             "DATABASE_URL": "${nodes.postgres.DATABASE_URL}",
             "REDIS_URL": "${nodes.redis.REDIS_URL}",
@@ -747,8 +747,8 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "crm",
   "url_template": "{service}.{branch ?? run}.crm.localhost",
 
@@ -759,7 +759,7 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.branch}-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=crm_${veld.branch} -p ${veld.port}:5432 -v veld-pg-${veld.branch}:/var/lib/postgresql/data postgres:16",
           "on_stop": "docker stop veld-pg-${veld.branch}-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 30 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/crm_${veld.branch}"
           }
@@ -772,7 +772,7 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @crm/api dev --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": { "database": "docker" },
           "env": {
             "DATABASE_URL": "${nodes.database.DATABASE_URL}"
@@ -786,7 +786,7 @@ The `verify` commands on the setup nodes make subsequent `veld start` calls fast
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @crm/web dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "depends_on": { "backend": "local" },
           "env": {
             "PORT": "${veld.port}",
@@ -815,8 +815,8 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "inventory",
   "url_template": "{service}.{username}.{branch ?? run}.inventory.localhost",
 
@@ -826,7 +826,7 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab
         "local": {
           "type": "start_server",
           "command": "go run ./cmd/server --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/healthz" },
+          "probes": { "readiness": { "type": "http", "path": "/healthz" } },
           "env": {
             "FRONTEND_URL": "${nodes.frontend.url}"
           }
@@ -839,7 +839,7 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @inventory/web dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "env": {
             "PORT": "${veld.port}",
             "NEXT_PUBLIC_API_URL": "${nodes.backend.url}"
@@ -861,8 +861,8 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "fintech",
   "url_template": "{service}.{branch ?? run}.fintech.localhost",
 
@@ -873,7 +873,7 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab
         "default": {
           "type": "command",
           "script": "./scripts/provision-dev-db.sh",
-          "verify": "./scripts/check-dev-db.sh",
+          "skip_if": "./scripts/check-dev-db.sh",
           "on_stop": "./scripts/teardown-dev-db.sh",
           "outputs": ["DATABASE_URL", "DB_PASSWORD", "DB_READONLY_URL"],
           "sensitive_outputs": ["DATABASE_URL", "DB_PASSWORD", "DB_READONLY_URL"]
@@ -898,7 +898,7 @@ Each branch also gets its own Docker volume (`veld-pg-${veld.branch}`) and datab
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @fintech/api dev --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": {
             "provision-db": "default",
             "fetch-api-keys": "default"
@@ -931,8 +931,8 @@ The `api` node receives the secrets as environment variables at runtime.
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "saas-platform",
   "url_template": "{service}.{branch ?? run}.saas-platform.test",
 
@@ -942,7 +942,7 @@ The `api` node receives the secrets as environment variables at runtime.
         "local": {
           "type": "start_server",
           "command": "go run ./cmd/api --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "env": {
             "COOKIE_DOMAIN": ".saas-platform.test",
             "CORS_ORIGINS": "${nodes.web.url},${nodes.admin.url}"
@@ -956,7 +956,7 @@ The `api` node receives the secrets as environment variables at runtime.
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @saas/web dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "env": {
             "PORT": "${veld.port}",
             "NEXT_PUBLIC_API_URL": "${nodes.api.url}"
@@ -971,7 +971,7 @@ The `api` node receives the secrets as environment variables at runtime.
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @saas/admin dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "env": {
             "PORT": "${veld.port}",
             "NEXT_PUBLIC_API_URL": "${nodes.api.url}"
@@ -997,8 +997,8 @@ The `admin` node uses a node-level `url_template` override to produce a differen
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "marketplace",
   "url_template": "{service}.{branch ?? run}.marketplace.localhost",
 
@@ -1015,7 +1015,7 @@ The `admin` node uses a node-level `url_template` override to produce a differen
         "local": {
           "type": "start_server",
           "command": "cargo run --bin user-service -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 }
+          "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } }
         },
         "staging": {
           "type": "command",
@@ -1031,7 +1031,7 @@ The `admin` node uses a node-level `url_template` override to produce a differen
         "local": {
           "type": "start_server",
           "command": "cargo run --bin catalog-service -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 }
+          "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } }
         },
         "staging": {
           "type": "command",
@@ -1047,7 +1047,7 @@ The `admin` node uses a node-level `url_template` override to produce a differen
         "local": {
           "type": "start_server",
           "command": "cargo run --bin payment-service -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health", "timeout_seconds": 60 }
+          "probes": { "readiness": { "type": "http", "path": "/health", "timeout_seconds": 60 } }
         },
         "staging": {
           "type": "command",
@@ -1073,7 +1073,7 @@ The `admin` node uses a node-level `url_template` override to produce a differen
         "local": {
           "type": "start_server",
           "command": "cargo run --bin gateway -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": {
             "user-service": "staging",
             "catalog-service": "local",
@@ -1090,7 +1090,7 @@ The `admin` node uses a node-level `url_template` override to produce a differen
         "local-full": {
           "type": "start_server",
           "command": "cargo run --bin gateway -- --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": {
             "user-service": "local",
             "catalog-service": "local",
@@ -1122,8 +1122,8 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "docs-site",
   "url_template": "{service}.{branch ?? run}.docs-site.localhost",
 
@@ -1134,7 +1134,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
         "default": {
           "type": "command",
           "command": "pnpm --filter @docs-site/content build",
-          "verify": "test -d ./packages/content/dist && test ./packages/content/dist/index.html -nt ./packages/content/src/index.md"
+          "skip_if": "test -d ./packages/content/dist && test ./packages/content/dist/index.html -nt ./packages/content/src/index.md"
         }
       }
     },
@@ -1145,7 +1145,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
         "default": {
           "type": "command",
           "command": "cargo build --release --bin docs-api",
-          "verify": "test -f ./target/release/docs-api && test ./target/release/docs-api -nt ./src/main.rs"
+          "skip_if": "test -f ./target/release/docs-api && test ./target/release/docs-api -nt ./src/main.rs"
         }
       }
     },
@@ -1155,7 +1155,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
         "local": {
           "type": "start_server",
           "command": "./target/release/docs-api --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": { "build-api": "default" }
         }
       }
@@ -1166,7 +1166,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
         "local": {
           "type": "start_server",
           "command": "python3 -m http.server ${veld.port} --directory ./packages/content/dist",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "depends_on": { "build-docs": "default" }
         }
       }
@@ -1175,7 +1175,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
 }
 ```
 
-**What happens:** The `build-docs` and `build-api` command nodes run first (in parallel, since they are independent). The `verify` commands check whether the build artifacts are newer than the source files -- if so, the builds are skipped. Then `docs:local` serves the static files and `api:local` runs the compiled binary. On first run, both builds execute. On subsequent runs, they are skipped unless source files changed.
+**What happens:** The `build-docs` and `build-api` command nodes run first (in parallel, since they are independent). The `skip_if` commands check whether the build artifacts are newer than the source files -- if so, the builds are skipped. Then `docs:local` serves the static files and `api:local` runs the compiled binary. On first run, both builds execute. On subsequent runs, they are skipped unless source files changed.
 
 ---
 
@@ -1185,8 +1185,8 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "jobrunner",
   "url_template": "{service}.{branch ?? run}.jobrunner.localhost",
 
@@ -1202,7 +1202,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=jobrunner -p ${veld.port}:5432 postgres:16",
           "on_stop": "docker stop veld-pg-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 30 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/jobrunner"
           }
@@ -1216,7 +1216,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
           "type": "start_server",
           "command": "docker run --rm --name veld-redis-${veld.run} -p ${veld.port}:6379 redis:7-alpine",
           "on_stop": "docker stop veld-redis-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 15 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 15 } },
           "outputs": {
             "REDIS_URL": "redis://localhost:${veld.port}/0"
           }
@@ -1229,7 +1229,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @jobrunner/api dev --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": {
             "postgres": "docker",
             "redis": "docker"
@@ -1248,7 +1248,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @jobrunner/worker start --port ${veld.port} --concurrency 5",
-          "health_check": { "type": "http", "path": "/status" },
+          "probes": { "readiness": { "type": "http", "path": "/status" } },
           "depends_on": {
             "postgres": "docker",
             "redis": "docker"
@@ -1267,7 +1267,7 @@ Note the use of qualified references (`${nodes.catalog-service:local.url}` vs `$
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @jobrunner/scheduler start --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": {
             "redis": "docker"
           },
@@ -1300,8 +1300,8 @@ All three application nodes are `start_server` with health check endpoints, so V
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "polyglot",
   "url_template": "{service}.{branch ?? run}.polyglot.localhost",
 
@@ -1312,7 +1312,7 @@ All three application nodes are `start_server` with health check endpoints, so V
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=polyglot -p ${veld.port}:5432 postgres:16",
           "on_stop": "docker stop veld-pg-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 30 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/polyglot",
             "JDBC_URL": "jdbc:postgresql://localhost:${veld.port}/polyglot"
@@ -1326,7 +1326,7 @@ All three application nodes are `start_server` with health check endpoints, so V
         "local": {
           "type": "start_server",
           "command": "cd services/auth && go run . --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/healthz" },
+          "probes": { "readiness": { "type": "http", "path": "/healthz" } },
           "depends_on": { "postgres": "docker" },
           "env": {
             "DATABASE_URL": "${nodes.postgres.DATABASE_URL}"
@@ -1340,12 +1340,12 @@ All three application nodes are `start_server` with health check endpoints, so V
         "local": {
           "type": "start_server",
           "command": "cd services/billing && ./gradlew bootRun --args='--server.port=${veld.port}'",
-          "health_check": {
+          "probes": { "readiness": {
             "type": "http",
             "path": "/actuator/health",
             "timeout_seconds": 90,
             "interval_ms": 3000
-          },
+          } },
           "depends_on": { "postgres": "docker" },
           "env": {
             "SPRING_DATASOURCE_URL": "${nodes.postgres.JDBC_URL}",
@@ -1361,7 +1361,7 @@ All three application nodes are `start_server` with health check endpoints, so V
         "local": {
           "type": "start_server",
           "command": "cd services/recommendations && uvicorn main:app --host 0.0.0.0 --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": { "postgres": "docker" },
           "env": {
             "DATABASE_URL": "${nodes.postgres.DATABASE_URL}"
@@ -1375,12 +1375,12 @@ All three application nodes are `start_server` with health check endpoints, so V
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @polyglot/web dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "depends_on": {
             "auth-service": "local",
             "billing-service": "local",
             "recommendation-service": "local"
-          },
+          },
           "env": {
             "PORT": "${veld.port}",
             "NEXT_PUBLIC_AUTH_URL": "${nodes.auth-service.url}",
@@ -1404,8 +1404,8 @@ All three application nodes are `start_server` with health check endpoints, so V
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "e2e-suite",
   "url_template": "{service}.{branch ?? run}.e2e-suite.localhost",
 
@@ -1421,7 +1421,7 @@ All three application nodes are `start_server` with health check endpoints, so V
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.run} -e POSTGRES_PASSWORD=veld -e POSTGRES_DB=e2e -p ${veld.port}:5432 postgres:16",
           "on_stop": "docker stop veld-pg-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 30 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/e2e"
           }
@@ -1448,7 +1448,7 @@ All three application nodes are `start_server` with health check endpoints, so V
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @e2e-suite/api dev --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": { "seed-test-data": "default" },
           "env": {
             "DATABASE_URL": "${nodes.postgres.DATABASE_URL}",
@@ -1463,7 +1463,7 @@ All three application nodes are `start_server` with health check endpoints, so V
         "local": {
           "type": "start_server",
           "command": "pnpm --filter @e2e-suite/web dev",
-          "health_check": { "type": "http", "path": "/" },
+          "probes": { "readiness": { "type": "http", "path": "/" } },
           "depends_on": { "backend": "local" },
           "env": {
             "PORT": "${veld.port}",
@@ -1512,8 +1512,8 @@ With `--preset dev`, only the frontend and its dependencies start (no test runne
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "platform",
   "url_template": "{service}.{branch ?? run}.platform.localhost",
 
@@ -1535,7 +1535,7 @@ With `--preset dev`, only the frontend and its dependencies start (no test runne
           "type": "start_server",
           "command": "docker run --rm --name veld-pg-${veld.run} --network ${veld.project}-net -e POSTGRES_PASSWORD=veld -p ${veld.port}:5432 postgres:16",
           "on_stop": "docker stop veld-pg-${veld.run}",
-          "health_check": { "type": "port", "timeout_seconds": 30 },
+          "probes": { "readiness": { "type": "port", "timeout_seconds": 30 } },
           "outputs": {
             "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/app"
           }
@@ -1547,7 +1547,7 @@ With `--preset dev`, only the frontend and its dependencies start (no test runne
         "local": {
           "type": "start_server",
           "command": "pnpm --filter backend dev --port ${veld.port}",
-          "health_check": { "type": "http", "path": "/health" },
+          "probes": { "readiness": { "type": "http", "path": "/health" } },
           "depends_on": { "postgres": "docker" },
           "env": { "DATABASE_URL": "${nodes.postgres.DATABASE_URL}" }
         }
diff --git a/install.sh b/install.sh
index c355398..e405ed6 100755
--- a/install.sh
+++ b/install.sh
@@ -210,6 +210,71 @@ fi
 
 # --- Install ---
 
+# --- Stop running environments (prevents stale state files after upgrade) ---
+#
+# The state file format may change across major versions, so stop every running
+# environment with the OLD binary (state is removed cleanly) before replacing it.
+
+EXISTING_VELD_BIN="$(command -v veld 2>/dev/null || true)"
+if [ -n "$EXISTING_VELD_BIN" ]; then
+  # Ask the installed binary for running environments across ALL projects.
+  # Best-effort: errors are ignored, and an empty result skips this whole step.
+  LIST_JSON="$("$EXISTING_VELD_BIN" list --json 2>/dev/null || true)"
+
+  # Extract "<project_root><TAB><run_name>" lines for running environments.
+  # The JSON structure is: { "projects": { "<path>": { "runs": { "<name>": { "status": "running" } } } } }
+  RUNNING_INFO=""
+  if [ -n "$LIST_JSON" ]; then
+    # Parse with python3 (no jq dependency); if it is missing or fails, nothing is stopped.
+    RUNNING_INFO="$(printf '%s\n' "$LIST_JSON" | python3 -c '
+import json, sys
+try:
+    data = json.load(sys.stdin)
+    for path, proj in data.get("projects", {}).items():
+        for name, run in proj.get("runs", {}).items():
+            if run.get("status") == "running":
+                print(f"{path}\t{name}")
+except Exception: pass
+' 2>/dev/null || true)"
+  fi
+
+  if [ -n "$RUNNING_INFO" ]; then
+    echo ""
+    echo "============================================================"
+    echo "  RUNNING ENVIRONMENTS DETECTED"
+    echo "============================================================"
+    echo ""
+    echo "  The following environments will be stopped before updating"
+    echo "  (prevents stale state files after upgrade):"
+    echo ""
+    printf '%s\n' "$RUNNING_INFO" | while IFS="$(printf '\t')" read -r proj_root run_name; do
+      echo "    - ${run_name}  (${proj_root})"
+    done
+    echo ""
+
+    if [ -z "${VELD_NON_INTERACTIVE:-}" ] && [ -t 0 ]; then
+      printf "  Stop all and continue? [Y/n] "
+      read -r answer < /dev/tty 2>/dev/null || answer="y"
+      answer="${answer:-y}"
+      case "$answer" in
+        [Yy]|[Yy][Ee][Ss]) ;;
+        *) echo "Update cancelled."; exit 0 ;;
+      esac
+    fi
+
+    echo "  Stopping environments..."
+    printf '%s\n' "$RUNNING_INFO" | while IFS="$(printf '\t')" read -r proj_root run_name; do
+      # </dev/null keeps `veld stop` from consuming the rest of the list on the loop's stdin.
+      if (cd "$proj_root" 2>/dev/null && "$EXISTING_VELD_BIN" stop --name "$run_name" </dev/null 2>/dev/null); then
+        echo "    Stopped '${run_name}'"
+      else
+        echo "    Warning: could not stop '${run_name}' (may need manual cleanup)"
+      fi
+    done
+    echo ""
+  fi
+fi
+
 echo "Installing binaries..."
 $NEED_SUDO mkdir -p "$INSTALL_DIR"
 $NEED_SUDO mkdir -p "$LIB_DIR"
diff --git a/schema/v1/veld.schema.json b/schema/v1/veld.schema.json
index 08ee57e..c5caf28 100644
--- a/schema/v1/veld.schema.json
+++ b/schema/v1/veld.schema.json
@@ -46,28 +46,6 @@
       "$ref": "#/$defs/ClientLogLevels",
       "description": "Client-side log levels to capture (project-level default). Overridable at node and variant level."
     },
-    "features": {
-      "$ref": "#/$defs/FeaturesConfig",
-      "description": "Feature toggles (project-level defaults). Overridable at node and variant level."
-    },
-    "env": {
-      "$ref": "#/$defs/EnvMap",
-      "description": "Global environment variables inherited by all node variants. Overridable at node and variant level."
-    },
-    "setup": {
-      "type": "array",
-      "description": "Steps that run sequentially before the dependency graph executes. If any step exits non-zero, startup is aborted. Not nodes — no variants, no health checks, no dependency graph participation.",
-      "items": {
-        "$ref": "#/$defs/SetupStep"
-      }
-    },
-    "teardown": {
-      "type": "array",
-      "description": "Steps that run sequentially after all nodes stop (after per-node on_stop hooks). Best-effort: failures are logged but do not block the stop operation. Commands should be idempotent.",
-      "items": {
-        "$ref": "#/$defs/SetupStep"
-      }
-    },
     "nodes": {
       "type": "object",
       "description": "The dependency graph nodes. Each key is a node name, and the value defines its variants.",
@@ -78,26 +56,6 @@
     }
   },
   "$defs": {
-    "SetupStep": {
-      "type": "object",
-      "description": "A lightweight step that runs before the dependency graph (setup) or after all nodes stop (teardown). Supports shell environment variables and project-level Veld variables: ${veld.name}, ${veld.project}, ${veld.root}, ${veld.run}.",
-      "required": ["name", "command"],
-      "additionalProperties": false,
-      "properties": {
-        "name": {
-          "type": "string",
-          "description": "Human-readable name for progress reporting and error messages."
-        },
-        "command": {
-          "type": "string",
-          "description": "Shell command to execute. Supports ${veld.name}, ${veld.project}, ${veld.root}, ${veld.run}, and shell environment variables."
-        },
-        "failureMessage": {
-          "type": "string",
-          "description": "Optional human-friendly message shown when the command fails (non-zero exit). Useful for prerequisite checks like 'Docker must be running'."
-        }
-      }
-    },
     "NodeConfig": {
       "type": "object",
       "description": "Configuration for a single node in the dependency graph.",
@@ -121,18 +79,6 @@
           "$ref": "#/$defs/ClientLogLevels",
           "description": "Client-side log levels override for all variants of this node."
         },
-        "features": {
-          "$ref": "#/$defs/FeaturesConfig",
-          "description": "Feature toggles override for all variants of this node."
-        },
-        "env": {
-          "$ref": "#/$defs/EnvMap",
-          "description": "Extra environment variables inherited by all variants of this node. Overrides project-level env. Overridable at variant level."
-        },
-        "cwd": {
-          "type": "string",
-          "description": "Working directory for all variants of this node. Relative paths are resolved from the project root. Overridable at variant level. Supports Veld variable substitution."
-        },
         "variants": {
           "type": "object",
           "description": "Available variants for this node. Each key is a variant name.",
@@ -174,11 +120,14 @@
           }
         },
         "env": {
-          "$ref": "#/$defs/EnvMap",
-          "description": "Extra environment variables injected into the process. Overrides node-level and project-level env."
+          "type": "object",
+          "description": "Extra environment variables injected into the process.",
+          "additionalProperties": {
+            "type": "string"
+          }
         },
         "outputs": {
-          "description": "Output declarations. For command steps: an array of output names captured from $VELD_OUTPUT_FILE (preferred) or VELD_OUTPUT stdout lines (legacy). For start_server steps: an object mapping output names to template strings.",
+          "description": "Output declarations. For command steps: an array of output names captured from VELD_OUTPUT. For start_server steps: an object mapping output names to template strings.",
           "oneOf": [
             {
               "type": "array",
@@ -206,7 +155,7 @@
         "strict_outputs": {
           "type": "boolean",
           "default": true,
-          "description": "When true (default), fail if the command produces output keys (via $VELD_OUTPUT_FILE or stdout) not declared in \"outputs\". Set to false to silently ignore undeclared outputs."
+          "description": "When true (default), fail if the command produces VELD_OUTPUT keys not declared in \"outputs\". Set to false to silently ignore undeclared outputs."
         },
         "verify": {
           "type": "string",
@@ -223,14 +172,6 @@
         "client_log_levels": {
           "$ref": "#/$defs/ClientLogLevels",
           "description": "Client-side log levels override for this specific variant."
-        },
-        "features": {
-          "$ref": "#/$defs/FeaturesConfig",
-          "description": "Feature toggles override for this specific variant."
-        },
-        "cwd": {
-          "type": "string",
-          "description": "Working directory for this variant. Relative paths are resolved from the project root. Overrides node-level cwd. Supports Veld variable substitution."
         }
       },
       "allOf": [
@@ -257,28 +198,6 @@
         }
       ]
     },
-    "FeaturesConfig": {
-      "type": "object",
-      "description": "Feature toggles for controlling which Veld capabilities are injected into a server's HTML responses. All properties are optional — omitted values inherit from the parent level (variant inherits from node, node from project). The built-in default for all features is true (enabled).",
-      "additionalProperties": false,
-      "properties": {
-        "feedback_overlay": {
-          "type": "boolean",
-          "description": "Inject the feedback overlay toolbar (FAB, screenshot, comments) into HTML responses. Default: true.",
-          "default": true
-        },
-        "client_logs": {
-          "type": "boolean",
-          "description": "Inject the client-side log collector into HTML responses. Default: true.",
-          "default": true
-        },
-        "inject": {
-          "type": "boolean",
-          "description": "Automatically inject bootstrap scripts into HTML responses. When false, the /__veld__/* proxy routes are still created so you can manually add <script> tags. Default: true.",
-          "default": true
-        }
-      }
-    },
     "ClientLogLevels": {
       "type": "array",
       "description": "Client-side console log levels to capture. Supported values: \"log\", \"warn\", \"error\", \"info\", \"debug\". Unhandled exceptions (\"exception\") are always captured regardless of this setting.",
@@ -289,13 +208,6 @@
       "default": ["log", "warn", "error"],
       "uniqueItems": true
     },
-    "EnvMap": {
-      "type": "object",
-      "description": "A map of environment variable names to values. Values support Veld variable substitution (e.g. ${veld.port}, ${nodes.backend.url}).",
-      "additionalProperties": {
-        "type": "string"
-      }
-    },
     "HealthCheck": {
       "type": "object",
       "description": "Health check configuration for a start_server variant.",
diff --git a/schema/v2/veld.schema.json b/schema/v2/veld.schema.json
new file mode 100644
index 0000000..08777d3
--- /dev/null
+++ b/schema/v2/veld.schema.json
@@ -0,0 +1,478 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "title": "Veld Project Configuration",
+  "description": "Configuration file for a Veld local development environment. Place this as veld.json in your project root.",
+  "type": "object",
+  "required": [
+    "schemaVersion",
+    "name",
+    "nodes"
+  ],
+  "additionalProperties": false,
+  "properties": {
+    "$schema": {
+      "type": "string",
+      "description": "Optional JSON Schema reference for editor autocompletion."
+    },
+    "schemaVersion": {
+      "type": "string",
+      "const": "2",
+      "description": "Schema version. Must be \"2\" for this version of Veld."
+    },
+    "name": {
+      "type": "string",
+      "description": "Human-readable project name. Used in URLs and registry entries.",
+      "pattern": "^[a-zA-Z0-9][a-zA-Z0-9._-]*$"
+    },
+    "url_template": {
+      "type": "string",
+      "description": "URL template for services. Supports placeholders: {service}, {variant}, {run}, {project}, {branch}, {worktree}, {username}, {hostname}. Supports fallback operator: {branch ?? run}.",
+      "default": "{service}.{run}.{project}.localhost",
+      "examples": [
+        "{service}.{run}.{project}.localhost",
+        "{service}.localhost:{port}"
+      ]
+    },
+    "presets": {
+      "type": "object",
+      "description": "Named shortcuts for node:variant selections. Each preset maps to an array of \"node:variant\" strings.",
+      "additionalProperties": {
+        "type": "array",
+        "items": {
+          "type": "string",
+          "description": "A node:variant selector, e.g. \"backend:local\" or \"frontend:docker\".",
+          "pattern": "^[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+$"
+        },
+        "minItems": 1
+      }
+    },
+    "client_log_levels": {
+      "$ref": "#/$defs/ClientLogLevels",
+      "description": "Client-side log levels to capture (project-level default). Overridable at node and variant level."
+    },
+    "features": {
+      "$ref": "#/$defs/FeaturesConfig",
+      "description": "Feature toggles (project-level defaults). Overridable at node and variant level."
+    },
+    "env": {
+      "$ref": "#/$defs/EnvMap",
+      "description": "Global environment variables inherited by all node variants. Overridable at node and variant level."
+    },
+    "setup": {
+      "type": "array",
+      "description": "Steps that run sequentially before the dependency graph executes. If any step exits non-zero, startup is aborted. Not nodes \u2014 no variants, no health checks, no dependency graph participation.",
+      "items": {
+        "$ref": "#/$defs/SetupStep"
+      }
+    },
+    "teardown": {
+      "type": "array",
+      "description": "Steps that run sequentially after all nodes stop (after per-node on_stop hooks). Best-effort: failures are logged but do not block the stop operation. Commands should be idempotent.",
+      "items": {
+        "$ref": "#/$defs/SetupStep"
+      }
+    },
+    "nodes": {
+      "type": "object",
+      "description": "The dependency graph nodes. Each key is a node name, and the value defines its variants.",
+      "additionalProperties": {
+        "$ref": "#/$defs/NodeConfig"
+      },
+      "minProperties": 1
+    }
+  },
+  "$defs": {
+    "SetupStep": {
+      "type": "object",
+      "description": "A lightweight step that runs before the dependency graph (setup) or after all nodes stop (teardown). Supports shell environment variables and project-level Veld variables: ${veld.name}, ${veld.project}, ${veld.root}, ${veld.run}.",
+      "required": [
+        "name",
+        "command"
+      ],
+      "additionalProperties": false,
+      "properties": {
+        "name": {
+          "type": "string",
+          "description": "Human-readable name for progress reporting and error messages."
+        },
+        "command": {
+          "type": "string",
+          "description": "Shell command to execute. Supports ${veld.name}, ${veld.project}, ${veld.root}, ${veld.run}, and shell environment variables."
+        },
+        "failureMessage": {
+          "type": "string",
+          "description": "Optional human-friendly message shown when the command fails (non-zero exit). Useful for prerequisite checks like 'Docker must be running'."
+        }
+      }
+    },
+    "NodeConfig": {
+      "type": "object",
+      "description": "Configuration for a single node in the dependency graph.",
+      "required": [
+        "variants"
+      ],
+      "additionalProperties": false,
+      "properties": {
+        "default_variant": {
+          "type": "string",
+          "description": "The variant to use when none is specified. If omitted and the node has exactly one variant, that variant is used automatically."
+        },
+        "url_template": {
+          "type": "string",
+          "description": "URL template override for all variants of this node. Overrides the project-level url_template."
+        },
+        "hidden": {
+          "type": "boolean",
+          "description": "When true, this node is hidden from `veld nodes` output. Hidden nodes still participate in the dependency graph and run normally.",
+          "default": false
+        },
+        "client_log_levels": {
+          "$ref": "#/$defs/ClientLogLevels",
+          "description": "Client-side log levels override for all variants of this node."
+        },
+        "features": {
+          "$ref": "#/$defs/FeaturesConfig",
+          "description": "Feature toggles override for all variants of this node."
+        },
+        "env": {
+          "$ref": "#/$defs/EnvMap",
+          "description": "Extra environment variables inherited by all variants of this node. Overrides project-level env. Overridable at variant level."
+        },
+        "cwd": {
+          "type": "string",
+          "description": "Working directory for all variants of this node. Relative paths are resolved from the project root. Overridable at variant level. Supports Veld variable substitution."
+        },
+        "variants": {
+          "type": "object",
+          "description": "Available variants for this node. Each key is a variant name.",
+          "additionalProperties": {
+            "$ref": "#/$defs/VariantConfig"
+          },
+          "minProperties": 1
+        }
+      }
+    },
+    "VariantConfig": {
+      "type": "object",
+      "description": "Configuration for a single variant of a node.",
+      "required": [
+        "type"
+      ],
+      "additionalProperties": false,
+      "properties": {
+        "type": {
+          "type": "string",
+          "enum": [
+            "command",
+            "start_server"
+          ],
+          "description": "Step type. \"command\" runs a command/script to completion. \"start_server\" starts a long-running process and waits for it to become healthy."
+        },
+        "command": {
+          "type": "string",
+          "description": "Inline shell command to execute. Supports Veld variable substitution (e.g. ${veld.port}, ${nodes.backend.url})."
+        },
+        "script": {
+          "type": "string",
+          "description": "Path to a script file, relative to veld.json. Mutually exclusive with command."
+        },
+        "health_check": {
+          "$ref": "#/$defs/HealthCheck",
+          "description": "Legacy health check configuration. Deprecated: use probes.readiness instead."
+        },
+        "probes": {
+          "$ref": "#/$defs/ProbesConfig",
+          "description": "Readiness and liveness probe configuration. probes.readiness supersedes the legacy health_check field."
+        },
+        "depends_on": {
+          "type": "object",
+          "description": "Dependencies: maps node names to the variant that must be running before this variant starts.",
+          "additionalProperties": {
+            "type": "string"
+          }
+        },
+        "env": {
+          "$ref": "#/$defs/EnvMap",
+          "description": "Extra environment variables injected into the process. Overrides node-level and project-level env."
+        },
+        "outputs": {
+          "description": "Output declarations. For command steps: an array of output names captured from $VELD_OUTPUT_FILE (preferred) or VELD_OUTPUT stdout lines (legacy). For start_server steps: an object mapping output names to template strings.",
+          "oneOf": [
+            {
+              "type": "array",
+              "items": {
+                "type": "string"
+              },
+              "description": "Declared output names for command steps."
+            },
+            {
+              "type": "object",
+              "additionalProperties": {
+                "type": "string"
+              },
+              "description": "Synthetic output templates for start_server steps."
+            }
+          ]
+        },
+        "sensitive_outputs": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Output keys whose values are sensitive. These are masked in logs and encrypted at rest."
+        },
+        "strict_outputs": {
+          "type": "boolean",
+          "default": true,
+          "description": "When true (default), fail if the command produces output keys (via $VELD_OUTPUT_FILE or stdout) not declared in \"outputs\". Set to false to silently ignore undeclared outputs."
+        },
+        "skip_if": {
+          "type": "string",
+          "description": "Idempotency check command. Only applies to \"command\" type variants. If this command exits 0, the step is considered already complete and is skipped. Previously named \"verify\"."
+        },
+        "verify": {
+          "type": "string",
+          "description": "Deprecated: use skip_if instead. Alias for skip_if, kept for backward compatibility."
+        },
+        "url_template": {
+          "type": "string",
+          "description": "URL template override for this specific variant. Overrides both node-level and project-level url_template."
+        },
+        "on_stop": {
+          "type": "string",
+          "description": "Teardown command to run when the environment is stopped. Executed in reverse dependency order. Supports Veld variable substitution."
+        },
+        "client_log_levels": {
+          "$ref": "#/$defs/ClientLogLevels",
+          "description": "Client-side log levels override for this specific variant."
+        },
+        "features": {
+          "$ref": "#/$defs/FeaturesConfig",
+          "description": "Feature toggles override for this specific variant."
+        },
+        "cwd": {
+          "type": "string",
+          "description": "Working directory for this variant. Relative paths are resolved from the project root. Overrides node-level cwd. Supports Veld variable substitution."
+        }
+      },
+      "allOf": [
+        {
+          "if": {
+            "properties": {
+              "type": {
+                "const": "start_server"
+              }
+            },
+            "required": [
+              "type"
+            ]
+          },
+          "then": {
+            "required": [
+              "command"
+            ]
+          }
+        },
+        {
+          "if": {
+            "properties": {
+              "type": {
+                "const": "command"
+              }
+            },
+            "required": [
+              "type"
+            ]
+          },
+          "then": {
+            "anyOf": [
+              {
+                "required": [
+                  "command"
+                ]
+              },
+              {
+                "required": [
+                  "script"
+                ]
+              }
+            ]
+          }
+        }
+      ]
+    },
+    "ProbesConfig": {
+      "type": "object",
+      "description": "Readiness and liveness probe configuration. Available for both command and start_server variants.",
+      "additionalProperties": false,
+      "properties": {
+        "readiness": {
+          "$ref": "#/$defs/HealthCheck",
+          "description": "Readiness probe \u2014 gates the dependency graph during startup. For start_server nodes: checked after process starts. For command nodes: checked after command exits 0."
+        },
+        "liveness": {
+          "$ref": "#/$defs/LivenessProbe",
+          "description": "Liveness probe \u2014 runs continuously after the node is healthy. Triggers recovery when failure_threshold consecutive checks fail."
+        }
+      }
+    },
+    "LivenessProbe": {
+      "type": "object",
+      "description": "Liveness probe configuration. Runs periodically after a node becomes healthy to detect failures (e.g., dropped SSH tunnels, crashed background processes).",
+      "required": [
+        "type"
+      ],
+      "additionalProperties": false,
+      "properties": {
+        "type": {
+          "type": "string",
+          "enum": [
+            "http",
+            "port",
+            "command"
+          ],
+          "description": "Liveness check strategy. \"http\" polls an HTTP endpoint. \"port\" checks if the port is accepting connections. \"command\" runs a command and checks the exit code."
+        },
+        "path": {
+          "type": "string",
+          "description": "HTTP path to poll (type \"http\" only).",
+          "examples": [
+            "/health",
+            "/ready",
+            "/"
+          ]
+        },
+        "expect_status": {
+          "type": "integer",
+          "description": "Expected HTTP status code (type \"http\" only). Defaults to 200.",
+          "minimum": 100,
+          "maximum": 599,
+          "default": 200
+        },
+        "command": {
+          "type": "string",
+          "description": "Shell command to run (type \"command\" only). Exit code 0 means healthy."
+        },
+        "interval_ms": {
+          "type": "integer",
+          "description": "Milliseconds between liveness checks.",
+          "default": 5000,
+          "minimum": 1000
+        },
+        "failure_threshold": {
+          "type": "integer",
+          "description": "Number of consecutive failures before triggering recovery.",
+          "default": 3,
+          "minimum": 1
+        },
+        "max_recoveries": {
+          "type": "integer",
+          "description": "Maximum number of recovery attempts before the node is permanently marked as failed.",
+          "default": 3,
+          "minimum": 0
+        }
+      }
+    },
+    "FeaturesConfig": {
+      "type": "object",
+      "description": "Feature toggles for controlling which Veld capabilities are injected into a server's HTML responses. All properties are optional \u2014 omitted values inherit from the parent level (variant inherits from node, node from project). The built-in default for all features is true (enabled).",
+      "additionalProperties": false,
+      "properties": {
+        "feedback_overlay": {
+          "type": "boolean",
+          "description": "Inject the feedback overlay toolbar (FAB, screenshot, comments) into HTML responses. Default: true.",
+          "default": true
+        },
+        "client_logs": {
+          "type": "boolean",
+          "description": "Inject the client-side log collector into HTML responses. Default: true.",
+          "default": true
+        },
+        "inject": {
+          "type": "boolean",
+          "description": "Automatically inject bootstrap scripts into HTML responses. When false, the /__veld__/* proxy routes are still created so you can manually add <script> tags. Default: true.",
+          "default": true
+        }
+      }
+    },
+    "ClientLogLevels": {
+      "type": "array",
+      "description": "Client-side console log levels to capture. Supported values: \"log\", \"warn\", \"error\", \"info\", \"debug\". Unhandled exceptions (\"exception\") are always captured regardless of this setting.",
+      "items": {
+        "type": "string",
+        "enum": [
+          "log",
+          "warn",
+          "error",
+          "info",
+          "debug"
+        ]
+      },
+      "default": [
+        "log",
+        "warn",
+        "error"
+      ],
+      "uniqueItems": true
+    },
+    "EnvMap": {
+      "type": "object",
+      "description": "A map of environment variable names to values. Values support Veld variable substitution (e.g. ${veld.port}, ${nodes.backend.url}).",
+      "additionalProperties": {
+        "type": "string"
+      }
+    },
+    "HealthCheck": {
+      "type": "object",
+      "description": "Health check / readiness probe configuration.",
+      "required": [
+        "type"
+      ],
+      "additionalProperties": false,
+      "properties": {
+        "type": {
+          "type": "string",
+          "enum": [
+            "http",
+            "port",
+            "command"
+          ],
+          "description": "Health check strategy. \"http\" polls an HTTP endpoint. \"port\" checks if the port is accepting connections. \"command\" runs a command and checks the exit code."
+        },
+        "path": {
+          "type": "string",
+          "description": "HTTP path to poll (type \"http\" only).",
+          "examples": [
+            "/health",
+            "/ready",
+            "/"
+          ]
+        },
+        "expect_status": {
+          "type": "integer",
+          "description": "Expected HTTP status code (type \"http\" only). Defaults to 200.",
+          "minimum": 100,
+          "maximum": 599,
+          "default": 200
+        },
+        "command": {
+          "type": "string",
+          "description": "Shell command to run (type \"command\" only). Exit code 0 means healthy."
+        },
+        "timeout_seconds": {
+          "type": "integer",
+          "description": "Maximum seconds to wait for the service to become healthy.",
+          "default": 60,
+          "minimum": 1
+        },
+        "interval_ms": {
+          "type": "integer",
+          "description": "Milliseconds between health check attempts.",
+          "default": 1000,
+          "minimum": 100
+        }
+      }
+    }
+  }
+}
diff --git a/skills/veld/SKILL.md b/skills/veld/SKILL.md
index c4b3d31..299a161 100644
--- a/skills/veld/SKILL.md
+++ b/skills/veld/SKILL.md
@@ -65,24 +65,30 @@ For the full config schema, variables, and node types, see [reference/config.md]
 
 Quick reference for the two node types:
 
-**`start_server`** — long-running process. Must bind to `${veld.port}`. Requires `health_check`.
+**`start_server`** — long-running process. Must bind to `${veld.port}`. Requires a readiness probe (`probes.readiness` or legacy `health_check`).
 ```json
 {
   "type": "start_server",
   "command": "npm run dev -- --port ${veld.port}",
-  "health_check": { "type": "http", "path": "/health" },
+  "probes": {
+    "readiness": { "type": "http", "path": "/health" },
+    "liveness": { "type": "http", "path": "/health", "interval_ms": 5000 }
+  },
   "depends_on": { "database": "docker" },
   "env": { "DATABASE_URL": "${nodes.database.DATABASE_URL}" }
 }
 ```
 
-**`command`** — run-to-completion. Emits outputs via `$VELD_OUTPUT_FILE`.
+**`command`** — run-to-completion. Emits outputs via `$VELD_OUTPUT_FILE`. Supports liveness probes for long-lived resources (e.g., SSH tunnels).
 ```json
 {
   "type": "command",
   "script": "./scripts/setup.sh",
   "outputs": ["DATABASE_URL"],
-  "verify": "./scripts/check.sh"
+  "skip_if": "./scripts/check.sh",
+  "probes": {
+    "liveness": { "type": "command", "command": "pg_isready", "interval_ms": 5000 }
+  }
 }
 ```
 
@@ -92,9 +98,28 @@ For the full feedback workflow, events, thread fields, interactive controls, and
 
 Core pattern: listen (returns all pending feedback at once) → fix → release with status comment → listen again with `--after <last_seq>` → repeat until `session_ended`. Threads are auto-claimed so multiple agents can work in parallel without conflicts.
 
+## Reading Outputs
+
+After starting an environment, read node outputs (database URLs, ports, credentials, etc.):
+
+```sh
+veld status --outputs --name my-feature        # human-readable
+veld status --outputs --json --name my-feature  # machine-readable
+```
+
+To debug liveness probe failures and recovery decisions:
+```sh
+veld logs --source internal --name my-feature     # shows probe stderr, recovery attempts
+veld logs --source internal -f --name my-feature  # follow mode
+```
+
+**Outputs can change after a recovery restart.** When a liveness probe triggers recovery (e.g., SSH tunnel drops and the DB clone restarts), the restarted node may produce new outputs (different port, new password, new connection string). Always re-read outputs with `veld status --outputs` after a restart rather than caching them. If you observe connection failures to a previously-working service, check whether a recovery happened and refresh your outputs.
+
 ## Gotchas
 
-- **`health_check` is required** on every `start_server` variant — veld will reject config without it
+- **Readiness probe is required** on every `start_server` variant — use `probes.readiness` (preferred) or legacy `health_check`
+- **`skip_if` replaces `verify`** — `verify` still works as an alias but `skip_if` is the canonical name
+- **Outputs are volatile** — after a recovery restart, outputs like `DATABASE_URL` may change. Never cache outputs long-term; re-read with `veld status --outputs` when needed
 - **`depends_on` needs the variant** — write `"backend": "local"`, not just `"backend"`
 - **`${...}` vs `{...}`** — `${veld.port}` in commands/env, `{service}` in URL templates. Mixing them up silently produces wrong values.
 - **`outputs` shape differs by type** — object (`{"KEY": "template"}`) for `start_server`, array (`["KEY"]`) for `command`
diff --git a/skills/veld/reference/config.md b/skills/veld/reference/config.md
index 1bd9122..442ffc7 100644
--- a/skills/veld/reference/config.md
+++ b/skills/veld/reference/config.md
@@ -4,8 +4,8 @@
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "myproject",
   "url_template": "{service}.{run}.{project}.localhost",
   "setup": [],
@@ -40,13 +40,16 @@ Variables: `${veld.name}`, `${veld.project}`, `${veld.root}`, `${veld.run}`, plu
 
 ### `start_server` — Long-running processes
 
-Must bind to `${veld.port}`. Requires `health_check`.
+Must bind to `${veld.port}`. Requires a readiness probe (`probes.readiness` or legacy `health_check`).
 
 ```json
 {
   "type": "start_server",
   "command": "npm run dev -- --port ${veld.port}",
-  "health_check": { "type": "http", "path": "/health", "timeout_seconds": 30 },
+  "probes": {
+    "readiness": { "type": "http", "path": "/health", "timeout_seconds": 30 },
+    "liveness": { "type": "http", "path": "/health", "interval_ms": 5000 }
+  },
   "depends_on": { "database": "docker" },
   "env": { "DATABASE_URL": "${nodes.database.DATABASE_URL}" },
   "outputs": { "DATABASE_URL": "postgresql://postgres:veld@localhost:${veld.port}/app" },
@@ -64,13 +67,18 @@ Emits outputs by writing `key=value` lines to `$VELD_OUTPUT_FILE`.
   "type": "command",
   "script": "./scripts/clone-db.sh",
   "outputs": ["DATABASE_URL", "DB_NAME"],
-  "verify": "./scripts/verify-db.sh"
+  "skip_if": "./scripts/verify-db.sh",
+  "probes": {
+    "liveness": { "type": "command", "command": "pg_isready", "interval_ms": 5000 }
+  }
 }
 ```
 
-## Health Checks
+## Probes
 
-Every `start_server` variant requires one. Three types:
+### Readiness (startup)
+
+Every `start_server` variant requires a readiness probe. Use `probes.readiness` (preferred) or legacy `health_check`. Three types:
 
 ```json
 { "type": "http", "path": "/health", "expect_status": 200, "timeout_seconds": 30 }
@@ -83,59 +91,29 @@ Every `start_server` variant requires one. Three types:
 - `command`: Exit 0 = healthy.
 - Defaults: `timeout_seconds`: 60, `interval_ms`: 1000 (min: 100).
 
-## Variable Interpolation
-
-### In commands, scripts, env values: `${...}`
-
-| Variable | Description |
-|----------|-------------|
-| `${veld.port}` | Allocated port (`start_server` only) |
-| `${veld.url}` | Full HTTPS URL (`start_server` only) |
-| `${veld.url.hostname}` | DNS name only |
-| `${veld.url.host}` | hostname:port |
-| `${veld.url.origin}` | scheme + host |
-| `${veld.url.scheme}` | Protocol (`https`) |
-| `${veld.url.port}` | HTTPS port |
-| `${veld.run}` | Run name |
-| `${veld.root}` | Absolute path to veld.json directory |
-| `${veld.project}` | Project name |
-| `${veld.branch}` | Current git branch (slugified) |
-| `${veld.worktree}` | Worktree directory name (slugified) |
-| `${veld.username}` | OS username |
-| `${nodes.<node>.<output>}` | Output from another node |
-| `${nodes.<node>.url}` | HTTPS URL of a start_server node |
-| `${nodes.<node>.port}` | Allocated port of a start_server node |
-
-Qualified references when two variants run simultaneously: `${nodes.backend:local.url}`.
-
-### In URL templates: `{...}`
-
-`{service}`, `{run}`, `{project}`, `{branch}`, `{worktree}`, `{username}`, `{hostname}`
-
-Fallback operator: `{branch ?? run}` — uses first non-empty value.
-
-Cascades: variant > node > project level.
+### Liveness (ongoing)
 
-## Dependencies
-
-Explicit `node → variant` mapping. Default variants are **never** silently assumed.
+Runs continuously after a node becomes healthy. Available for both `command` and `start_server` types. Same three check types as readiness: `http`, `port`, `command` (arbitrary shell command, exit 0 = healthy).
 
 ```json
-"depends_on": { "database": "docker", "backend": "local" }
+"probes": {
+  "liveness": {
+    "type": "command",
+    "command": "pg_isready -h localhost -p 5432",
+    "interval_ms": 5000,
+    "failure_threshold": 3,
+    "max_recoveries": 3
+  }
+}
 ```
 
-Dependencies start before dependents. Independent branches run in parallel. Teardown is reverse order.
-
-## Presets
-
-Named shortcuts for common selections:
+- `type`: `"http"`, `"port"`, or `"command"` — same semantics as readiness probes
+- `command`: Shell command run via `sh -c`. Node outputs are available as env vars (e.g., `$DB_HOST`). Pipes, redirects, and `&&` chains all work. Timeout: 30s.
+- `interval_ms`: Check interval (default: 5000, min: 1000)
+- `failure_threshold`: Consecutive failures before recovery (default: 3)
+- `max_recoveries`: Max recovery attempts before permanent failure (default: 3)
 
-```json
-"presets": {
-  "fullstack": ["frontend:local", "backend:local", "database:docker"],
-  "ui-only": ["frontend:local", "backend:staging"]
-}
-```
+Recovery = full environment restart (`veld restart`). After `max_recoveries` is exhausted, the node is permanently failed.
 
 ## Other Fields
 
@@ -150,3 +128,5 @@ Named shortcuts for common selections:
 | `features` | project, node, variant | `{"feedback_overlay": bool, "client_logs": bool, "inject": bool}`. All default `true`. |
 | `on_stop` | variant | Per-node teardown command that runs on `veld stop`. |
 | `sensitive_outputs` | variant | Output keys to mask in logs and encrypt at rest. |
+| `skip_if` | variant (`command` only) | Idempotency check — skip the step if the check exits 0. Alias: `verify`. |
+| `probes` | variant | `{readiness?: HealthCheck, liveness?: LivenessProbe}`. Available for both node types. |
diff --git a/tests/validate-schema.sh b/tests/validate-schema.sh
index c52890b..c3976ba 100755
--- a/tests/validate-schema.sh
+++ b/tests/validate-schema.sh
@@ -10,7 +10,8 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
 
-SCHEMA="$REPO_ROOT/schema/v1/veld.schema.json"
+SCHEMA_V1="$REPO_ROOT/schema/v1/veld.schema.json"
+SCHEMA_V2="$REPO_ROOT/schema/v2/veld.schema.json"
 CHECK="python3 -m check_jsonschema"
 
 if [[ "${1:-}" == "--install" ]]; then
@@ -42,18 +43,29 @@ run_check() {
 echo "=== JSON Schema Validation ==="
 echo
 
-echo "1) Meta-schema: validating schema/v1/veld.schema.json against JSON Schema draft 2020-12"
-run_check "veld.schema.json is valid" \
-  $CHECK --check-metaschema "$SCHEMA"
+echo "1) Meta-schema: validating schema files against JSON Schema draft 2020-12"
+run_check "schema/v1/veld.schema.json is valid" \
+  $CHECK --check-metaschema "$SCHEMA_V1"
+run_check "schema/v2/veld.schema.json is valid" \
+  $CHECK --check-metaschema "$SCHEMA_V2"
 
 echo
-echo "2) Instance validation: checking project configs against the schema"
+echo "2) Instance validation: checking project configs against their schema version"
 
 # Find all veld.json files in the repo (excluding node_modules, target, etc.)
 while IFS= read -r config; do
   rel="${config#"$REPO_ROOT/"}"
-  run_check "$rel" \
-    $CHECK --schemafile "$SCHEMA" "$config"
+
+  # Pick the schema from the file's schemaVersion field (defaults to "1" if absent or unreadable).
+  # The path is passed via argv, not spliced into the Python source, so quotes in it are safe.
+  version=$(python3 -c "import json, sys; print(json.load(open(sys.argv[1])).get('schemaVersion', '1'))" "$config" 2>/dev/null || echo "1")
+  schema="$SCHEMA_V1"
+  if [[ "$version" == "2" ]]; then
+    schema="$SCHEMA_V2"
+  fi
+
+  run_check "$rel (v$version)" \
+    $CHECK --schemafile "$schema" "$config"
 done < <(find "$REPO_ROOT" -name "veld.json" \
   -not -path "*/node_modules/*" \
   -not -path "*/target/*" \
diff --git a/website/llms-full.txt b/website/llms-full.txt
index e761977..5c95937 100644
--- a/website/llms-full.txt
+++ b/website/llms-full.txt
@@ -59,8 +59,8 @@ cargo build --release
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "myproject",
   "url_template": "{service}.{run}.{project}.localhost",
   "nodes": {
@@ -189,7 +189,8 @@ Captured logs appear alongside server logs in `veld logs` and the management UI.
 ```sh
 veld logs --source client          # Browser logs only
 veld logs --source server          # Server logs only
-veld logs --source all             # Both (default)
+veld logs --source internal        # Liveness probe & recovery logs
+veld logs --source all             # All sources (default)
 ```
 
 ### Configuration
@@ -223,7 +224,7 @@ Available features: `feedback_overlay` (toolbar/comments UI), `client_logs` (bro
 | Field | Type | Required | Description |
 |-------|------|----------|-------------|
 | `$schema` | string | No | JSON Schema URL for editor autocompletion |
-| `schemaVersion` | string | Yes | Must be `"1"` |
+| `schemaVersion` | string | Yes | `"1"` or `"2"` (use `"2"` for new projects) |
 | `name` | string | Yes | Project name (used in URLs) |
 | `url_template` | string | No | URL template for services |
 | `presets` | object | No | Named shortcuts for node:variant selections |
@@ -282,7 +283,7 @@ Runs a shell command or script to completion. Used for setup tasks.
 - Can emit outputs by writing `key=value` lines to `$VELD_OUTPUT_FILE` (preferred) or via `VELD_OUTPUT key=value` on stdout (legacy, discouraged)
 - Declares outputs as an array of strings
 - Built-in output: `exit_code`
-- Supports `verify` for idempotency
+- Supports `skip_if` for idempotency (alias: `verify`)
 - Does NOT get a port allocated — `${veld.port}` is not available
 
 ```json
@@ -432,7 +433,8 @@ Usage: `veld start --preset fullstack --name my-feature`
 | Field | Type | Applies To | Description |
 |-------|------|------------|-------------|
 | `on_stop` | string | All | Teardown command on `veld stop` |
-| `verify` | string | `command` only | Idempotency check (exit 0 = skip) |
+| `skip_if` | string | `command` only | Idempotency check (exit 0 = skip). Alias: `verify` |
+| `probes` | object | All | `{readiness?: HealthCheck, liveness?: LivenessProbe}` — supersedes `health_check` |
 | `sensitive_outputs` | array | All | Output keys to mask and encrypt |
 | `client_log_levels` | array | `start_server` | Browser log levels override (variant > node > project) |
 | `features` | object | `start_server` | Feature toggles override (variant > node > project) |
@@ -445,8 +447,8 @@ Usage: `veld start --preset fullstack --name my-feature`
 
 ```json
 {
-  "$schema": "https://veld.oss.life.li/schema/v1/veld.schema.json",
-  "schemaVersion": "1",
+  "$schema": "https://veld.oss.life.li/schema/v2/veld.schema.json",
+  "schemaVersion": "2",
   "name": "myproject",
   "url_template": "{service}.{run}.myproject.localhost",
   "presets": {