4 changes: 1 addition & 3 deletions .github/workflows/e2e.yml
@@ -96,8 +96,6 @@ jobs:
just get-credentials
- name: E2E Test
run: |
nix build ".#${SET}.scripts.get-logs"
nix run ".#${SET}.scripts.get-logs" start workspace/just.namespace &
just e2e "${TEST_NAME}"
- name: Check for skipped
id: skipped
@@ -111,7 +109,7 @@ jobs:
- name: Download logs
if: always() && steps.skipped.outputs.skipped == 'false'
run: |
nix run ".#${SET}.scripts.get-logs" download workspace/just.namespace
just download-logs
- name: Upload logs
if: always() && steps.skipped.outputs.skipped == 'false'
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
109 changes: 109 additions & 0 deletions dev-docs/e2e/debugging.md
@@ -0,0 +1,109 @@
# Debugging e2e failures

## Collecting logs

### After a `just e2e` run

`just e2e` deploys a log-collector DaemonSet that streams pod logs from the
start. After the test finishes (pass or fail), download the logs:

```bash
just download-logs
```

Logs are written to `workspace/logs/`.

### After a manual deployment (`just`)

If you deployed with `just` (the default target) and want to collect logs:

```bash
just download-logs
```

This deploys the log-collector DaemonSet (if not already running), collects
host-level journal entries, and downloads everything.

### In CI

CI runs `just download-logs` automatically after every e2e test. Logs are
uploaded as GitHub Actions artifacts. To find them: go to the workflow run,
scroll to the bottom of the run summary page, and look for artifacts named
`e2e_pod_logs-<platform>-<test>` (for example, `e2e_pod_logs-Metal-QEMU-SNP-openssl`).
Alternatively, expand the "Upload logs" step of a particular test job and copy
the artifact download URL from its output.

## Log structure

```
workspace/logs/
├── <namespace>_<pod>_<uid>/ # pod container logs
│ └── <container>/0.log
├── host/<node-name>/ # host-level journal logs (per node)
│ ├── kernel.log # journalctl -k (SEV-ES termination, VFIO/IOMMU)
│ ├── k3s.log # journalctl -u k3s (k3s-specific kubelet/containerd)
│ ├── kubelet.log # journalctl -u kubelet (non-k3s runners)
│ ├── containerd.log # journalctl -u containerd (non-k3s runners)
│ └── kata.log # journalctl -t kata (QEMU lifecycle, register dumps)
├── metadata/<node-name>/
│ └── sandbox-map.txt # CVM pod name -> kata sandbox ID
└── <namespace>-k8s-events.yaml # kubernetes events
```

Host logs are time-scoped to the namespace creation time, so they only contain
entries relevant to the test run.

## Debugging CVM failures

CVM boot failures (for example, SEV-ES termination or OVMF crashes) leave no trace in
pod logs: the guest never starts. Look at host-level logs instead:

1. **kernel.log**: look for `SEV-ES guest requested termination`, VFIO/IOMMU
   errors, or KVM failures.
2. **kata.log**: look for `detected guest crash`, QEMU launch arguments,
   register dumps, and console output (`vmconsole=` lines contain guest serial
   output).
3. **k3s.log**: look for `task is in unknown state` or containerd errors that
   indicate the CVM process died.
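
After downloading, a quick first pass is to grep all host logs at once for the
markers above. This is a sketch; the paths assume the `workspace/logs/` layout
shown earlier:

```shell
logs=workspace/logs/host
# Scan every node's host logs for the known CVM failure markers.
grep -rhE 'SEV-ES guest requested termination|detected guest crash|task is in unknown state' \
  "$logs" 2>/dev/null || echo "no known failure markers found"
```

Any hit tells you which log file (and therefore which of the steps above) to
drill into next.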

## Tracing a pod to its sandbox in kata.log

kata.log contains interleaved logs from all sandboxes. The collected metadata
file (`metadata/sandbox-map.txt`) maps CVM pod names to kata sandbox IDs.
Comment on lines +71 to +72
Member

May be worth noting that we only get sandbox IDs for pods that are present during log collection (this mostly makes a difference for the regression test, where we're starting and stopping a lot of things).

Collaborator Author

Hmm, good point. I added a note. I haven't looked into the behavior of each of our regression tests. I assume that if a regression test fails, execution stops, so that's the point where you could run `just download-logs` and get the failed pod's sandbox included in the map.

Member

No, it continues. But let's move that into a separate PR.

Collaborator Author

I added back some documentation for figuring out the mapping from the runtime and created a ticket for a follow-up PR for this.

The sandbox map only includes pods that are still running at log collection time.
Pods that were deleted earlier in the test (for example, the regression test
creates and tears down multiple rounds of pods) won't have entries.

1. Find the sandbox ID for a pod:

```bash
cat workspace/logs/metadata/*/sandbox-map.txt
# coordinator-0 f4bb878b2e58bd3bd5a89fe2bc99b7368fc6aa070a0b8490a5c69a7c9816be65
# openssl-backend-757688b785-dvr4c 3658285f5581ad51...
# openssl-frontend-575dfdbb89-srwvr 828d8660496f6ac4...
```

2. Filter kata.log for a specific pod's sandbox:

```bash
sandbox=$(grep coordinator workspace/logs/metadata/*/sandbox-map.txt | awk '{print $2}')
grep "$sandbox" workspace/logs/host/kata.log
```

### Fallback: Finding sandboxes by runtime class hash

If a pod is missing from the sandbox map (deleted before log collection), you
can find its sandbox ID using the runtime class hash from kata.log. The hash
is the last component of the runtime class name (for example, `d17bc85e` from
`contrast-cc-metal-qemu-snp-d17bc85e`):

```bash
grep "d17bc85e" workspace/logs/host/*/kata.log | grep -oP 'sandbox=\K[a-f0-9]+' | sort -u
```

This lists all sandbox IDs for that runtime class. Cross-reference with the
sandbox map to identify which ones are unmapped.
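
Assuming the same paths as in the snippets above, the set difference can be
computed directly; `comm -23` prints the IDs that appear in kata.log but have
no entry in the map:

```shell
# Sandbox IDs seen in kata.log that are missing from the sandbox map.
comm -23 \
  <(grep -hoP 'sandbox=\K[a-f0-9]+' workspace/logs/host/*/kata.log | sort -u) \
  <(awk '{print $2}' workspace/logs/metadata/*/sandbox-map.txt | sort -u)
```
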

Note that some kata log lines (config loading, factory init, device cold plug)
don't have a sandbox ID. These are shared across all CVMs and may be relevant
for debugging startup failures.
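
To inspect only those shared lines, invert the sandbox filter (same path
assumption as above):

```shell
# Kata log lines without a sandbox ID: config loading, factory init, cold plug.
grep -hv 'sandbox=' workspace/logs/host/*/kata.log
```
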
21 changes: 20 additions & 1 deletion justfile
@@ -31,6 +31,21 @@ debugshell: (push "debugshell")

k8s-log-collector: (push "k8s-log-collector")

# Download all logs (pod logs + host journal). Deploys the log-collector if not already running.
download-logs set=default_set:
#!/usr/bin/env bash
set -euo pipefail
# Only push if not already pushed (e.g. by _e2e).
if ! grep -q "k8s-log-collector" "{{ workspace_dir }}/just.containerlookup" 2>/dev/null; then
just k8s-log-collector
fi
namespace_file="{{ workspace_dir }}/just.namespace"
if [[ ! -f "$namespace_file" ]]; then
echo "No namespace file found at $namespace_file. Deploy something first." >&2
exit 1
fi
nix run .#{{ set }}.scripts.get-logs -- download "$namespace_file"

containerd-reproducer set=default_set:
#!/usr/bin/env bash
set -euo pipefail
@@ -80,7 +95,7 @@ e2e target=default_deploy_target platform=default_platform set=default_set:
echo "Using set=$RESOLVED_SET for test '{{ target }}'"
set="$RESOLVED_SET" just _e2e {{ target }} {{ platform }}

_e2e target=default_deploy_target platform=default_platform set=default_set: soft-clean coordinator initializer openssl port-forwarder service-mesh-proxy memdump debugshell (node-installer platform)
_e2e target=default_deploy_target platform=default_platform set=default_set: soft-clean coordinator initializer openssl port-forwarder service-mesh-proxy memdump debugshell k8s-log-collector (node-installer platform)
#!/usr/bin/env bash
set -euo pipefail
if [[ {{ platform }} == "Metal-QEMU-SNP-GPU" || {{ platform }} == "Metal-QEMU-TDX-GPU" ]] ; then
@@ -97,6 +112,10 @@ _e2e target=default_deploy_target platform=default_platform set=default_set: sof
if [[ {{ target }} == "containerd-11644-reproducer" ]]; then
just containerd-reproducer
fi
get_logs=$(nix build .#{{ set }}.scripts.get-logs --no-link --print-out-paths)
"$get_logs/bin/get-logs" start ./{{ workspace_dir }}/just.namespace &
get_logs_pid=$!
trap 'kill $get_logs_pid || true' EXIT
nix shell .#{{ set }}.contrast.e2e --command {{ target }}.test -test.v \
--image-replacements ./{{ workspace_dir }}/just.containerlookup \
--namespace-file ./{{ workspace_dir }}/just.namespace \
31 changes: 31 additions & 0 deletions packages/by-name/k8s-log-collector/collect-host-logs.sh
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Copyright 2026 Edgeless Systems GmbH
# SPDX-License-Identifier: BUSL-1.1

set -euo pipefail

since="${1:?usage: collect-host-logs <since>}"
node="${NODE_NAME:?NODE_NAME must be set}"
mkdir -p "/export/logs/host/$node"
collect_journal() {
  local name="$1"
  shift
  echo "Collecting $name logs (since $since)..." >&2
  journalctl --directory=/journal "$@" --since="$since" --no-pager 2>/dev/null \
    >"/export/logs/host/$node/$name.log" || rm -f "/export/logs/host/$node/$name.log"
}
collect_journal kernel -k
collect_journal k3s -u k3s
collect_journal kubelet -u kubelet
collect_journal containerd -u containerd
collect_journal kata -t kata
echo "Collecting pod-sandbox metadata..." >&2
mkdir -p "/export/logs/metadata/$node"
for sock in /run/k3s/containerd/containerd.sock /run/containerd/containerd.sock; do
if [[ -S $sock ]]; then
CONTAINER_RUNTIME_ENDPOINT="unix://$sock" crictl pods -o json 2>/dev/null |
jq -r --arg ns "${POD_NAMESPACE:-}" \
'.items[] | select(.metadata.namespace == $ns and .runtimeHandler != "" and .runtimeHandler != null) | "\(.metadata.name)\t\(.id)"' \
>"/export/logs/metadata/$node/sandbox-map.txt"
break
fi
done
echo "Host log collection complete." >&2
47 changes: 38 additions & 9 deletions packages/by-name/k8s-log-collector/package.nix
@@ -3,21 +3,50 @@

{
writeShellApplication,
symlinkJoin,
inotify-tools,
coreutils,
findutils,
gnused,
gnugrep,
systemdMinimal,
cri-tools,
jq,
}:

writeShellApplication {
name = "collect-logs";
runtimeInputs = [
inotify-tools
coreutils
findutils
gnugrep
gnused
let
collect-pod-logs = writeShellApplication {
name = "collect-pod-logs";
runtimeInputs = [
inotify-tools
coreutils
findutils
gnugrep
gnused
];
text = builtins.readFile ./collect-pod-logs.sh;
};

# systemdMinimal disables all compression by default, but we need it
# to read host journal files that may be compressed with LZ4/ZSTD.
systemdWithJournal = systemdMinimal.override { withCompression = true; };

collect-host-logs = writeShellApplication {
name = "collect-host-logs";
runtimeInputs = [
coreutils
systemdWithJournal
cri-tools
jq
];
text = builtins.readFile ./collect-host-logs.sh;
};
in

symlinkJoin {
name = "k8s-log-collector";
paths = [
collect-pod-logs
collect-host-logs
];
text = builtins.readFile ./script.sh;
}
38 changes: 33 additions & 5 deletions packages/by-name/scripts/get-logs/get-logs.sh
@@ -33,9 +33,7 @@ deploy_collectors() {
trap cleanup INT TERM EXIT
tail -n +1 -f "$namespace_file" |
while IFS= read -r namespace; do
cp ./packages/log-collector.yaml ./workspace/log-collector.yaml
echo "Starting log collector in namespace $namespace" >&2
retry kubectl apply -n "$namespace" -f ./workspace/log-collector.yaml
deploy_to_namespace "$namespace"
done
}

Expand All @@ -50,6 +48,18 @@ kill_deploy_collectors() {
wait "$deploy_pid" 2>/dev/null || true
}

deploy_to_namespace() {
local namespace="$1"
cp ./packages/log-collector.yaml ./workspace/log-collector.yaml
replacement=$(grep "k8s-log-collector:latest=" ./workspace/just.containerlookup 2>/dev/null | tail -1 | cut -d= -f2- || true)
if [[ -n $replacement ]]; then
echo "Using pushed log-collector image: $replacement" >&2
sed -i "s|image: .*k8s-log-collector.*|image: \"$replacement\"|" ./workspace/log-collector.yaml
fi
echo "Starting log collector in namespace $namespace" >&2
retry kubectl apply -n "$namespace" -f ./workspace/log-collector.yaml
}

if [[ $# -lt 2 ]]; then
echo "Usage: get-logs [start | download] namespaceFile"
exit 1
@@ -73,7 +83,14 @@ download)
mkdir -p "./workspace/logs"
log_pods_missing=false
while read -r namespace; do
start_time=$(kubectl get ns "$namespace" -o jsonpath='{.metadata.creationTimestamp}')

pods="$(kubectl get pods -o name -n "$namespace" | grep log-collector | cut -c 5- || true)"
if [[ -z $pods ]]; then
deploy_to_namespace "$namespace"
kubectl rollout status daemonset/log-collector -n "$namespace" --timeout=60s
pods="$(kubectl get pods -o name -n "$namespace" | grep log-collector | cut -c 5- || true)"
fi
if [[ -z $pods ]]; then
echo "No log-collector pods found in namespace $namespace" >&2
log_pods_missing=true
@@ -83,12 +100,23 @@ download)
echo "Collecting logs from namespace $namespace, pod $pod" >&2
retry kubectl wait --for=condition=Ready -n "$namespace" "pod/$pod"
echo "Pod $pod is ready" >&2
retry kubectl exec -n "$namespace" "$pod" -- /bin/bash -c "rm -f /exported-logs.tar.gz; cp -r /export /export-no-stream; tar zcvf /exported-logs.tar.gz /export-no-stream; rm -rf /export-no-stream"

echo "Collecting host-level logs (since $start_time)..." >&2
retry kubectl exec -n "$namespace" "$pod" -- \
collect-host-logs "$start_time" || true

retry kubectl exec -n "$namespace" "$pod" -- /bin/bash -c '
rm -f /exported-logs.tar.gz
cp -r /export /export-no-stream
find /export-no-stream -empty -delete
tar zcvf /exported-logs.tar.gz /export-no-stream
rm -rf /export-no-stream
'
retry kubectl cp -n "$namespace" "$pod:/exported-logs.tar.gz" ./workspace/logs/exported-logs.tar.gz
echo "Downloaded logs tarball for namespace $namespace, pod $pod, extracting..." >&2
tar xzvf ./workspace/logs/exported-logs.tar.gz --directory "./workspace/logs"
rm ./workspace/logs/exported-logs.tar.gz
mv ./workspace/logs/export-no-stream/logs/* ./workspace/logs/
cp -rn ./workspace/logs/export-no-stream/logs/* ./workspace/logs/
rm -rf ./workspace/logs/export-no-stream
done
echo "Collecting Kubernetes events for namespace $namespace" >&2
4 changes: 2 additions & 2 deletions packages/containers.nix
@@ -2,7 +2,6 @@
# SPDX-License-Identifier: BUSL-1.1

{
lib,
pkgs,
contrastPkgs,
dockerTools,
@@ -165,9 +164,10 @@
coreutils
gnutar
gzip
contrastPkgs.k8s-log-collector
];
config = {
Cmd = [ "${lib.getExe contrastPkgs.k8s-log-collector}" ];
Cmd = [ "${contrastPkgs.k8s-log-collector}/bin/collect-pod-logs" ];
Volumes."/logs" = { };
};
};
26 changes: 26 additions & 0 deletions packages/log-collector.yaml
@@ -29,17 +29,43 @@ spec:
- mountPath: /logs
name: log-volume
readOnly: true
- mountPath: /journal
name: journal-volume
readOnly: true
- mountPath: /run/containerd
name: containerd-run
readOnly: true
- mountPath: /run/k3s/containerd
name: k3s-containerd-run
readOnly: true
env:
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumes:
- name: log-volume
# mount the nodes logs to the container
hostPath:
path: /var/log/pods
type: Directory
- name: journal-volume
# mount the systemd journal for host-level log collection
hostPath:
path: /var/log/journal
type: DirectoryOrCreate
- name: containerd-run
hostPath:
path: /run/containerd
type: DirectoryOrCreate
- name: k3s-containerd-run
hostPath:
path: /run/k3s/containerd
type: DirectoryOrCreate
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass