diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index e2fdae29846..749ff5fbad4 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -96,8 +96,6 @@ jobs: just get-credentials - name: E2E Test run: | - nix build ".#${SET}.scripts.get-logs" - nix run ".#${SET}.scripts.get-logs" start workspace/just.namespace & just e2e "${TEST_NAME}" - name: Check for skipped id: skipped @@ -111,7 +109,7 @@ jobs: - name: Download logs if: always() && steps.skipped.outputs.skipped == 'false' run: | - nix run ".#${SET}.scripts.get-logs" download workspace/just.namespace + just download-logs - name: Upload logs if: always() && steps.skipped.outputs.skipped == 'false' uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 diff --git a/dev-docs/e2e/debugging.md b/dev-docs/e2e/debugging.md new file mode 100644 index 00000000000..be2f61de218 --- /dev/null +++ b/dev-docs/e2e/debugging.md @@ -0,0 +1,109 @@ +# Debugging e2e failures + +## Collecting logs + +### After a `just e2e` run + +`just e2e` deploys a log-collector DaemonSet that streams pod logs from the +start. After the test finishes (pass or fail), download the logs: + +```bash +just download-logs +``` + +Logs are written to `workspace/logs/`. + +### After a manual deployment (`just`) + +If you deployed with `just` (the default target) and want to collect logs: + +```bash +just download-logs +``` + +This deploys the log-collector DaemonSet (if not already running), collects +host-level journal entries, and downloads everything. + +### In CI + +CI runs `just download-logs` automatically after every e2e test. Logs are +uploaded as GitHub Actions artifacts. To find them: go to the workflow run, +scroll to the bottom of the run summary page, and look for artifacts named +`e2e_pod_logs-<platform>-<test-name>` (for example, `e2e_pod_logs-Metal-QEMU-SNP-openssl`). +Alternatively you can expand the "Upload logs" step in a particular test and +get the Artifact download URL. 
+

## Log structure

```
workspace/logs/
├── <namespace>_<pod>_<uid>/ # pod container logs
│ └── <container>/0.log
├── host/<node>/ # host-level journal logs (per node)
│ ├── kernel.log # journalctl -k (SEV-ES termination, VFIO/IOMMU)
│ ├── k3s.log # journalctl -u k3s (k3s-specific kubelet/containerd)
│ ├── kubelet.log # journalctl -u kubelet (non-k3s runners)
│ ├── containerd.log # journalctl -u containerd (non-k3s runners)
│ └── kata.log # journalctl -t kata (QEMU lifecycle, register dumps)
├── metadata/<node>/
│ └── sandbox-map.txt # CVM pod name -> kata sandbox ID
└── <namespace>-k8s-events.yaml # kubernetes events
```

Host logs are time-scoped to the namespace creation time, so they only contain
entries relevant to the test run.

## Debugging CVM failures

CVM boot failures (for example, SEV-ES termination, OVMF crashes) leave no trace in
pod logs — the guest never starts. Look at host-level logs instead:

1. **kernel.log** — look for `SEV-ES guest requested termination`, VFIO/IOMMU
   errors, or KVM failures.
2. **kata.log** — look for `detected guest crash`, QEMU launch arguments,
   register dumps, and console output (`vmconsole=` lines contain guest serial
   output).
3. **k3s.log** — look for `task is in unknown state` or containerd errors that
   indicate the CVM process died.

## Tracing a pod to its sandbox in kata.log

kata.log contains interleaved logs from all sandboxes. The collected metadata
file (`metadata/sandbox-map.txt`) maps CVM pod names to kata sandbox IDs.
The sandbox map only includes pods that are still running at log collection time.
Pods that might have been deleted earlier in the test (one such example is the
regression test which creates and tears down multiple rounds of pods) won't have entries.

1. Find the sandbox ID for a pod:

```bash
cat workspace/logs/metadata/*/sandbox-map.txt
# coordinator-0 f4bb878b2e58bd3bd5a89fe2bc99b7368fc6aa070a0b8490a5c69a7c9816be65
# openssl-backend-757688b785-dvr4c 3658285f5581ad51... 
+# openssl-frontend-575dfdbb89-srwvr 828d8660496f6ac4... +``` + +2. Filter kata.log for a specific pod's sandbox: + +```bash +sandbox=$(grep coordinator workspace/logs/metadata/*/sandbox-map.txt | awk '{print $2}') +grep "$sandbox" workspace/logs/host/kata.log +``` + +### Fallback: Finding sandboxes by runtime class hash + +If a pod is missing from the sandbox map (deleted before log collection), you +can find its sandbox ID using the runtime class hash from kata.log. The hash +is the last component of the runtime class name (for example, `d17bc85e` from +`contrast-cc-metal-qemu-snp-d17bc85e`): + +```bash +grep "d17bc85e" workspace/logs/host/*/kata.log | grep -oP 'sandbox=\K[a-f0-9]+' | sort -u +``` + +This lists all sandbox IDs for that runtime class. Cross-reference with the +sandbox map to identify which ones are unmapped. + +Note that some kata log lines (config loading, factory init, device cold plug) +don't have a sandbox ID. These are shared across all CVMs and may be relevant +for debugging startup failures. diff --git a/justfile b/justfile index ff226c5ed75..ef9d099fdcd 100644 --- a/justfile +++ b/justfile @@ -31,6 +31,21 @@ debugshell: (push "debugshell") k8s-log-collector: (push "k8s-log-collector") +# Download all logs (pod logs + host journal). Deploys the log-collector if not already running. +download-logs set=default_set: + #!/usr/bin/env bash + set -euo pipefail + # Only push if not already pushed (e.g. by _e2e). + if ! grep -q "k8s-log-collector" "{{ workspace_dir }}/just.containerlookup" 2>/dev/null; then + just k8s-log-collector + fi + namespace_file="{{ workspace_dir }}/just.namespace" + if [[ ! -f "$namespace_file" ]]; then + echo "No namespace file found at $namespace_file. Deploy something first." 
>&2 + exit 1 + fi + nix run .#{{ set }}.scripts.get-logs -- download "$namespace_file" + containerd-reproducer set=default_set: #!/usr/bin/env bash set -euo pipefail @@ -80,7 +95,7 @@ e2e target=default_deploy_target platform=default_platform set=default_set: echo "Using set=$RESOLVED_SET for test '{{ target }}'" set="$RESOLVED_SET" just _e2e {{ target }} {{ platform }} -_e2e target=default_deploy_target platform=default_platform set=default_set: soft-clean coordinator initializer openssl port-forwarder service-mesh-proxy memdump debugshell (node-installer platform) +_e2e target=default_deploy_target platform=default_platform set=default_set: soft-clean coordinator initializer openssl port-forwarder service-mesh-proxy memdump debugshell k8s-log-collector (node-installer platform) #!/usr/bin/env bash set -euo pipefail if [[ {{ platform }} == "Metal-QEMU-SNP-GPU" || {{ platform }} == "Metal-QEMU-TDX-GPU" ]] ; then @@ -97,6 +112,10 @@ _e2e target=default_deploy_target platform=default_platform set=default_set: sof if [[ {{ target }} == "containerd-11644-reproducer" ]]; then just containerd-reproducer fi + get_logs=$(nix build .#{{ set }}.scripts.get-logs --no-link --print-out-paths) + "$get_logs/bin/get-logs" start ./{{ workspace_dir }}/just.namespace & + get_logs_pid=$! 
+ trap 'kill $get_logs_pid || true' EXIT nix shell .#{{ set }}.contrast.e2e --command {{ target }}.test -test.v \ --image-replacements ./{{ workspace_dir }}/just.containerlookup \ --namespace-file ./{{ workspace_dir }}/just.namespace \ diff --git a/packages/by-name/k8s-log-collector/collect-host-logs.sh b/packages/by-name/k8s-log-collector/collect-host-logs.sh new file mode 100644 index 00000000000..ceb3f8ab5a4 --- /dev/null +++ b/packages/by-name/k8s-log-collector/collect-host-logs.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright 2026 Edgeless Systems GmbH +# SPDX-License-Identifier: BUSL-1.1 + +set -euo pipefail + +since="${1:?usage: collect-host-logs <since>}" +node="${NODE_NAME:?NODE_NAME must be set}" +mkdir -p "/export/logs/host/$node" +echo "Collecting kernel logs (since $since)..." >&2 +journalctl --directory=/journal -k --since="$since" --no-pager 2>/dev/null >/export/logs/host/"$node"/kernel.log || rm -f /export/logs/host/"$node"/kernel.log +echo "Collecting k3s logs (since $since)..." >&2 +journalctl --directory=/journal -u k3s --since="$since" --no-pager 2>/dev/null >/export/logs/host/"$node"/k3s.log || rm -f /export/logs/host/"$node"/k3s.log +echo "Collecting kubelet logs (since $since)..." >&2 +journalctl --directory=/journal -u kubelet --since="$since" --no-pager 2>/dev/null >/export/logs/host/"$node"/kubelet.log || rm -f /export/logs/host/"$node"/kubelet.log +echo "Collecting containerd logs (since $since)..." >&2 +journalctl --directory=/journal -u containerd --since="$since" --no-pager 2>/dev/null >/export/logs/host/"$node"/containerd.log || rm -f /export/logs/host/"$node"/containerd.log +echo "Collecting kata logs (since $since)..." >&2 +journalctl --directory=/journal -t kata --since="$since" --no-pager 2>/dev/null >/export/logs/host/"$node"/kata.log || rm -f /export/logs/host/"$node"/kata.log +echo "Collecting pod-sandbox metadata..." 
>&2 +mkdir -p "/export/logs/metadata/$node" +for sock in /run/k3s/containerd/containerd.sock /run/containerd/containerd.sock; do + if [[ -S $sock ]]; then + CONTAINER_RUNTIME_ENDPOINT="unix://$sock" crictl pods -o json 2>/dev/null | + jq -r --arg ns "${POD_NAMESPACE:-}" \ + '.items[] | select(.metadata.namespace == $ns and .runtimeHandler != "" and .runtimeHandler != null) | "\(.metadata.name)\t\(.id)"' \ + >"/export/logs/metadata/$node/sandbox-map.txt" + break + fi +done +echo "Host log collection complete." >&2 diff --git a/packages/by-name/k8s-log-collector/script.sh b/packages/by-name/k8s-log-collector/collect-pod-logs.sh similarity index 100% rename from packages/by-name/k8s-log-collector/script.sh rename to packages/by-name/k8s-log-collector/collect-pod-logs.sh diff --git a/packages/by-name/k8s-log-collector/package.nix b/packages/by-name/k8s-log-collector/package.nix index 038ca0479fa..653c47f3701 100644 --- a/packages/by-name/k8s-log-collector/package.nix +++ b/packages/by-name/k8s-log-collector/package.nix @@ -3,21 +3,50 @@ { writeShellApplication, + symlinkJoin, inotify-tools, coreutils, findutils, gnused, gnugrep, + systemdMinimal, + cri-tools, + jq, }: -writeShellApplication { - name = "collect-logs"; - runtimeInputs = [ - inotify-tools - coreutils - findutils - gnugrep - gnused +let + collect-pod-logs = writeShellApplication { + name = "collect-pod-logs"; + runtimeInputs = [ + inotify-tools + coreutils + findutils + gnugrep + gnused + ]; + text = builtins.readFile ./collect-pod-logs.sh; + }; + + # systemdMinimal disables all compression by default, but we need it + # to read host journal files that may be compressed with LZ4/ZSTD. 
+ systemdWithJournal = systemdMinimal.override { withCompression = true; }; + + collect-host-logs = writeShellApplication { + name = "collect-host-logs"; + runtimeInputs = [ + coreutils + systemdWithJournal + cri-tools + jq + ]; + text = builtins.readFile ./collect-host-logs.sh; + }; +in + +symlinkJoin { + name = "k8s-log-collector"; + paths = [ + collect-pod-logs + collect-host-logs ]; - text = builtins.readFile ./script.sh; } diff --git a/packages/by-name/scripts/get-logs/get-logs.sh b/packages/by-name/scripts/get-logs/get-logs.sh index ab015e86bb7..df6a4fd1b80 100644 --- a/packages/by-name/scripts/get-logs/get-logs.sh +++ b/packages/by-name/scripts/get-logs/get-logs.sh @@ -33,9 +33,7 @@ deploy_collectors() { trap cleanup INT TERM EXIT tail -n +1 -f "$namespace_file" | while IFS= read -r namespace; do - cp ./packages/log-collector.yaml ./workspace/log-collector.yaml - echo "Starting log collector in namespace $namespace" >&2 - retry kubectl apply -n "$namespace" -f ./workspace/log-collector.yaml + deploy_to_namespace "$namespace" done } @@ -50,6 +48,18 @@ kill_deploy_collectors() { wait "$deploy_pid" 2>/dev/null || true } +deploy_to_namespace() { + local namespace="$1" + cp ./packages/log-collector.yaml ./workspace/log-collector.yaml + replacement=$(grep "k8s-log-collector:latest=" ./workspace/just.containerlookup 2>/dev/null | tail -1 | cut -d= -f2- || true) + if [[ -n $replacement ]]; then + echo "Using pushed log-collector image: $replacement" >&2 + sed -i "s|image: .*k8s-log-collector.*|image: \"$replacement\"|" ./workspace/log-collector.yaml + fi + echo "Starting log collector in namespace $namespace" >&2 + retry kubectl apply -n "$namespace" -f ./workspace/log-collector.yaml +} + if [[ $# -lt 2 ]]; then echo "Usage: get-logs [start | download] namespaceFile" exit 1 @@ -73,7 +83,14 @@ download) mkdir -p "./workspace/logs" log_pods_missing=false while read -r namespace; do + start_time=$(kubectl get ns "$namespace" -o jsonpath='{.metadata.creationTimestamp}') 
+ pods="$(kubectl get pods -o name -n "$namespace" | grep log-collector | cut -c 5- || true)" + if [[ -z $pods ]]; then + deploy_to_namespace "$namespace" + kubectl rollout status daemonset/log-collector -n "$namespace" --timeout=60s + pods="$(kubectl get pods -o name -n "$namespace" | grep log-collector | cut -c 5- || true)" + fi if [[ -z $pods ]]; then echo "No log-collector pods found in namespace $namespace" >&2 log_pods_missing=true @@ -83,12 +100,23 @@ download) echo "Collecting logs from namespace $namespace, pod $pod" >&2 retry kubectl wait --for=condition=Ready -n "$namespace" "pod/$pod" echo "Pod $pod is ready" >&2 - retry kubectl exec -n "$namespace" "$pod" -- /bin/bash -c "rm -f /exported-logs.tar.gz; cp -r /export /export-no-stream; tar zcvf /exported-logs.tar.gz /export-no-stream; rm -rf /export-no-stream" + + echo "Collecting host-level logs (since $start_time)..." >&2 + retry kubectl exec -n "$namespace" "$pod" -- \ + collect-host-logs "$start_time" || true + + retry kubectl exec -n "$namespace" "$pod" -- /bin/bash -c ' + rm -f /exported-logs.tar.gz + cp -r /export /export-no-stream + find /export-no-stream -empty -delete + tar zcvf /exported-logs.tar.gz /export-no-stream + rm -rf /export-no-stream + ' retry kubectl cp -n "$namespace" "$pod:/exported-logs.tar.gz" ./workspace/logs/exported-logs.tar.gz echo "Downloaded logs tarball for namespace $namespace, pod $pod, extracting..." 
>&2 tar xzvf ./workspace/logs/exported-logs.tar.gz --directory "./workspace/logs" rm ./workspace/logs/exported-logs.tar.gz - mv ./workspace/logs/export-no-stream/logs/* ./workspace/logs/ + cp -rn ./workspace/logs/export-no-stream/logs/* ./workspace/logs/ rm -rf ./workspace/logs/export-no-stream done echo "Collecting Kubernetes events for namespace $namespace" >&2 diff --git a/packages/containers.nix b/packages/containers.nix index 02ca6b407cc..7e11a6cb8d3 100644 --- a/packages/containers.nix +++ b/packages/containers.nix @@ -2,7 +2,6 @@ # SPDX-License-Identifier: BUSL-1.1 { - lib, pkgs, contrastPkgs, dockerTools, @@ -165,9 +164,10 @@ coreutils gnutar gzip + contrastPkgs.k8s-log-collector ]; config = { - Cmd = [ "${lib.getExe contrastPkgs.k8s-log-collector}" ]; + Cmd = [ "${contrastPkgs.k8s-log-collector}/bin/collect-pod-logs" ]; Volumes."/logs" = { }; }; }; diff --git a/packages/log-collector.yaml b/packages/log-collector.yaml index c41c535ca21..3c6c756fa10 100644 --- a/packages/log-collector.yaml +++ b/packages/log-collector.yaml @@ -29,17 +29,43 @@ spec: - mountPath: /logs name: log-volume readOnly: true + - mountPath: /journal + name: journal-volume + readOnly: true + - mountPath: /run/containerd + name: containerd-run + readOnly: true + - mountPath: /run/k3s/containerd + name: k3s-containerd-run + readOnly: true env: - name: POD_NAMESPACE valueFrom: fieldRef: fieldPath: metadata.namespace + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName volumes: - name: log-volume # mount the nodes logs to the container hostPath: path: /var/log/pods type: Directory + - name: journal-volume + # mount the systemd journal for host-level log collection + hostPath: + path: /var/log/journal + type: DirectoryOrCreate + - name: containerd-run + hostPath: + path: /run/containerd + type: DirectoryOrCreate + - name: k3s-containerd-run + hostPath: + path: /run/k3s/containerd + type: DirectoryOrCreate --- apiVersion: scheduling.k8s.io/v1 kind: PriorityClass