From 9e3ee1217c8fa13ae03ede0b7408b6870d9e3cbf Mon Sep 17 00:00:00 2001 From: Guillaume Hetier Date: Fri, 3 Apr 2026 15:14:22 -0700 Subject: [PATCH 1/7] Add diagnostics for Linux XDP Debug BVT runner crash The Debug+UseXdp Linux BVT job consistently crashes the GitHub Actions runner (~72 min vs ~92 min success). The runner disconnects entirely, so the Test step logs are lost (HTTP 404). Two-pronged approach to capture diagnostics: 1. test.yml: Add a 'System Diagnostics (pre-test)' step that runs BEFORE the Test step. Because this step completes normally, its logs are preserved even when the runner later crashes. Captures baseline memory, disk, kernel version, dmesg, core pattern, and CPU info. 2. test.ps1: For the Linux XDP sudo path: - Print memory/disk/load before and after each test binary - Check dmesg for OOM killer, XDP, BPF, and segfault messages - Start a background resource monitor that logs to a file every 30 seconds under artifacts/xdp_diagnostics/ (uploaded as artifact if the runner survives long enough) - Limit core dumps to 1 GB (ulimit -c 1048576) to prevent cascading crashes from filling the disk - Add process timeout (6000s) as safety net against hangs Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/test.yml | 16 ++++++++++++++++ scripts/test.ps1 | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0021146d22..907872332f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -197,6 +197,22 @@ jobs: $ManifestPath = ".\src\manifest\MsQuicEtw.man" wevtutil.exe um $ManifestPath wevtutil.exe im $ManifestPath /rf:$($MsQuicDll) /mf:$($MsQuicDll) + - name: System Diagnostics (pre-test) + if: matrix.vec.plat == 'linux' && matrix.vec.xdp == '-UseXdp' + run: | + echo "=== Memory ===" + free -h + echo "=== Disk ===" + df -h / /tmp + echo "=== Kernel / XDP ===" + uname -r + sudo dmesg -T 2>/dev/null | tail -30 || true + echo "=== Core pattern ===" + cat /proc/sys/kernel/core_pattern || true + echo "=== ulimits ===" + ulimit -a + echo "=== CPU ===" + nproc && cat /proc/loadavg - name: Test if: matrix.vec.os == 'WinServerPrerelease' shell: pwsh diff --git a/scripts/test.ps1 b/scripts/test.ps1 index f164c14639..2af455d1bc 100644 --- a/scripts/test.ps1 +++ b/scripts/test.ps1 @@ -375,7 +375,41 @@ for ($iteration = 1; $iteration -le $NumIterations; $iteration++) { foreach ($TestPath in $TestPaths) { if ($IsLinux -and $UseXdp) { $NOFILE = Invoke-Expression "bash -c 'ulimit -n'" - Invoke-Expression ('/usr/bin/sudo bash -c "ulimit -n $NOFILE && pwsh $RunTest -Path $TestPath $TestArguments"') + $DiagDir = Join-Path $RootDir "artifacts" "xdp_diagnostics" + New-Item -ItemType Directory -Path $DiagDir -Force | Out-Null + $DiagFile = Join-Path $DiagDir "resource_monitor.log" + $BinaryName = Split-Path $TestPath -Leaf + + # Start background resource monitor (writes to file for artifact + # upload since the Test step's console output may be lost if the + # runner crashes). + $MonitorCmd = "while true; do echo `"[\`$(date +%H:%M:%S)] free: \`$(free -m | awk '/Mem:/{print \`$3\"/\"\`$2\"MB\"}')" + + " disk: \`$(df -h / | awk 'NR==2{print \`$3\"/\"\`$2}')" + + " load: \`$(cat /proc/loadavg | cut -d' ' -f1-3)`"" + + " >> $DiagFile; sleep 30; done" + $MonitorPid = $null + try { + $MonitorPid = (Start-Process -FilePath "bash" -ArgumentList "-c", $MonitorCmd -PassThru -NoNewWindow).Id + } catch { + Write-Host "Warning: Could not start resource monitor: $_" + } + + Write-Host ">>> [XDP Diag] Before ${BinaryName}:" + bash -c "free -h; echo '---'; df -h / /tmp; echo '---'; cat /proc/loadavg" + # Limit core dumps to 1 GB to prevent cascading crashes from + # filling the disk. Use timeout as a safety net. + Invoke-Expression ('/usr/bin/sudo bash -c "ulimit -n $NOFILE && ulimit -c 1048576 && timeout --signal=KILL --foreground 6000 pwsh $RunTest -Path $TestPath $TestArguments"') + $TestExitCode = $LASTEXITCODE + Write-Host ">>> [XDP Diag] After ${BinaryName} (exit=$TestExitCode):" + bash -c "free -h; echo '---'; df -h / /tmp; echo '---'; cat /proc/loadavg" + # Check for OOM killer or XDP kernel errors + Write-Host ">>> [XDP Diag] dmesg check:" + bash -c "sudo dmesg -T --since '2 hours ago' 2>/dev/null | grep -iE 'oom|killed process|xdp|bpf|out of memory|segfault' | tail -20 || true" + + # Stop the background monitor + if ($MonitorPid) { + Stop-Process -Id $MonitorPid -ErrorAction SilentlyContinue + } } else { Invoke-Expression ($RunTest + " -Path $TestPath " + $TestArguments) } From fc85c13543cc23fe3c9f214364cb43ad4ce480eb Mon Sep 17 00:00:00 2001 From: Guillaume Hetier Date: Fri, 3 Apr 2026 15:36:18 -0700 Subject: [PATCH 2/7] Fix PowerShell parsing error in resource monitor script Use a single-quoted here-string (@'...'@) to write the bash monitor script to a file, avoiding PowerShell's interpretation of $ and / inside the awk commands. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- scripts/test.ps1 | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/scripts/test.ps1 b/scripts/test.ps1 index 2af455d1bc..74b272cdea 100644 --- a/scripts/test.ps1 +++ b/scripts/test.ps1 @@ -383,13 +383,18 @@ for ($iteration = 1; $iteration -le $NumIterations; $iteration++) { # Start background resource monitor (writes to file for artifact # upload since the Test step's console output may be lost if the # runner crashes). - $MonitorCmd = "while true; do echo `"[\`$(date +%H:%M:%S)] free: \`$(free -m | awk '/Mem:/{print \`$3\"/\"\`$2\"MB\"}')" + - " disk: \`$(df -h / | awk 'NR==2{print \`$3\"/\"\`$2}')" + - " load: \`$(cat /proc/loadavg | cut -d' ' -f1-3)`"" + - " >> $DiagFile; sleep 30; done" + $MonitorScript = Join-Path $DiagDir "monitor.sh" + @' +#!/bin/bash +while true; do + echo "[$(date +%H:%M:%S)] mem=$(free -m | awk 'NR==2{print $3"/"$2"MB"}') disk=$(df -h / | awk 'NR==2{print $3"/"$2}') load=$(cut -d' ' -f1-3 /proc/loadavg)" >> "$1" + sleep 30 +done +'@ | Set-Content -Path $MonitorScript -NoNewline + bash -c "chmod +x $MonitorScript" $MonitorPid = $null try { - $MonitorPid = (Start-Process -FilePath "bash" -ArgumentList "-c", $MonitorCmd -PassThru -NoNewWindow).Id + $MonitorPid = (Start-Process -FilePath "bash" -ArgumentList $MonitorScript, $DiagFile -PassThru -NoNewWindow).Id } catch { Write-Host "Warning: Could not start resource monitor: $_" } From a6304e3e853e9562e09a9b6aee290666ee4f1040 Mon Sep 17 00:00:00 2001 From: Guillaume Hetier Date: Fri, 3 Apr 2026 17:34:05 -0700 Subject: [PATCH 3/7] Post XDP diagnostics as PR comments to survive runner crash The Test step logs are completely lost (HTTP 404) when the runner crashes, so console diagnostics are useless. Instead, post a PR comment after each test binary with memory, disk, load, dmesg, and resource monitor data. These comments survive because they're sent via GitHub API before the next binary starts. Also disable core dumps entirely via hard limit (ulimit -Hc 0) since the ulimit -c unlimited in run-gtest.ps1 overrides any soft limit. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- scripts/test.ps1 | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/scripts/test.ps1 b/scripts/test.ps1 index 74b272cdea..90c1798bef 100644 --- a/scripts/test.ps1 +++ b/scripts/test.ps1 @@ -401,15 +401,30 @@ done Write-Host ">>> [XDP Diag] Before ${BinaryName}:" bash -c "free -h; echo '---'; df -h / /tmp; echo '---'; cat /proc/loadavg" - # Limit core dumps to 1 GB to prevent cascading crashes from - # filling the disk. Use timeout as a safety net. - Invoke-Expression ('/usr/bin/sudo bash -c "ulimit -n $NOFILE && ulimit -c 1048576 && timeout --signal=KILL --foreground 6000 pwsh $RunTest -Path $TestPath $TestArguments"') + # Disable core dumps entirely (the ulimit -c unlimited in + # run-gtest.ps1 overrides any soft limit set here, so we set + # the hard limit to 0). Use timeout as a safety net. + Invoke-Expression ('/usr/bin/sudo bash -c "ulimit -n $NOFILE && ulimit -Hc 0 && timeout --signal=KILL --foreground 6000 pwsh $RunTest -Path $TestPath $TestArguments"') $TestExitCode = $LASTEXITCODE Write-Host ">>> [XDP Diag] After ${BinaryName} (exit=$TestExitCode):" - bash -c "free -h; echo '---'; df -h / /tmp; echo '---'; cat /proc/loadavg" - # Check for OOM killer or XDP kernel errors - Write-Host ">>> [XDP Diag] dmesg check:" - bash -c "sudo dmesg -T --since '2 hours ago' 2>/dev/null | grep -iE 'oom|killed process|xdp|bpf|out of memory|segfault' | tail -20 || true" + $DiagMsg = bash -c "echo 'mem:'; free -h | head -2; echo 'disk:'; df -h / | tail -1; echo 'load:'; cat /proc/loadavg; echo 'dmesg:'; sudo dmesg -T --since '2 hours ago' 2>/dev/null | grep -iE 'oom|killed process|xdp|bpf|out of memory|segfault' | tail -10 || true" + $DiagMsg | ForEach-Object { Write-Host $_ } + + # Post diagnostics as a PR comment so they survive a runner crash. + # GITHUB_TOKEN and GITHUB_REPOSITORY are set by GitHub Actions. + if ($env:GITHUB_TOKEN -and $env:GITHUB_REPOSITORY -and $env:GITHUB_REF -match 'refs/pull/(\d+)') { + $PrNumber = $Matches[1] + $CommentBody = "### XDP Diag checkpoint: ``$BinaryName`` (exit=$TestExitCode)`n``````n$($DiagMsg -join "`n")`n```````n_Resource monitor:_`n``````n$(if (Test-Path $DiagFile) { Get-Content $DiagFile -Raw } else { 'no data' })`n``````" + $JsonBody = @{ body = $CommentBody } | ConvertTo-Json -Compress + try { + Invoke-RestMethod -Uri "https://api.github.com/repos/$($env:GITHUB_REPOSITORY)/issues/$PrNumber/comments" ` + -Method Post -Headers @{ Authorization = "Bearer $($env:GITHUB_TOKEN)"; "Content-Type" = "application/json" } ` + -Body $JsonBody -ErrorAction SilentlyContinue | Out-Null + Write-Host ">>> [XDP Diag] Posted checkpoint to PR #$PrNumber" + } catch { + Write-Host ">>> [XDP Diag] Failed to post PR comment: $_" + } + } # Stop the background monitor if ($MonitorPid) { From 2c448b772542c40fd9e69804504a18e5398235b1 Mon Sep 17 00:00:00 2001 From: Guillaume Hetier Date: Fri, 3 Apr 2026 19:01:10 -0700 Subject: [PATCH 4/7] Pass GITHUB_TOKEN to Test step for PR comment diagnostics GITHUB_TOKEN is not automatically available as an environment variable in GitHub Actions steps. Explicitly pass it so test.ps1 can post diagnostic checkpoints as PR comments that survive runner crashes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 907872332f..bf50caccf4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -222,6 +222,8 @@ jobs: if: matrix.vec.os != 'WinServerPrerelease' shell: pwsh timeout-minutes: 120 + env: + GITHUB_TOKEN: ${{ github.token }} run: scripts/test.ps1 -Config ${{ matrix.vec.config }} -Arch ${{ matrix.vec.arch }} -Tls ${{ matrix.vec.tls }} -OsRunner ${{ matrix.vec.os }} -GHA -LogProfile ${{ matrix.vec.log || (inputs.log_level || 'Full.Light') }} -GenerateXmlResults ${{ matrix.vec.xdp }} ${{ matrix.vec.qtip }} ${{ inputs.filter && '-Filter' }} ${{ inputs.filter || '' }} - name: Fix log permissions for Linux XDP if: failure() && matrix.vec.plat == 'linux' # (matrix.vec.plat == 'linux' && matrix.vec.xdp == '-UseXdp') doesn't work for some reason From 7175b4f6137d0962eede0987cbf23af1834699a2 Mon Sep 17 00:00:00 2001 From: Guillaume Hetier Date: Fri, 3 Apr 2026 20:31:17 -0700 Subject: [PATCH 5/7] Fix PR comment diagnostics: use curl with temp file, add pull-requests permission - Add pull-requests: write permission to workflow - Rewrite Post-XdpDiag to write JSON to temp file and use curl -d @file to avoid PowerShell escaping issues - Post diagnostic comment BEFORE each binary starts (not just after) so we get data even if the first binary crashes the runner - Remove duplicate/broken curl call Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/test.yml | 1 + scripts/test.ps1 | 56 ++++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bf50caccf4..b258de84c6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -48,6 +48,7 @@ concurrency: permissions: contents: read issues: write + pull-requests: write jobs: build-windows-kernel: diff --git a/scripts/test.ps1 b/scripts/test.ps1 index 90c1798bef..ebbeb8d480 100644 --- a/scripts/test.ps1 +++ b/scripts/test.ps1 @@ -380,9 +380,29 @@ for ($iteration = 1; $iteration -le $NumIterations; $iteration++) { $DiagFile = Join-Path $DiagDir "resource_monitor.log" $BinaryName = Split-Path $TestPath -Leaf - # Start background resource monitor (writes to file for artifact - # upload since the Test step's console output may be lost if the - # runner crashes). + # Helper: post a diagnostic comment to the PR via GitHub API. + function Post-XdpDiag($Title, $Body) { + if (-not $env:GITHUB_TOKEN -or -not $env:GITHUB_REPOSITORY) { + Write-Host ">>> [XDP Diag] Missing GITHUB_TOKEN or GITHUB_REPOSITORY" + return + } + if (-not ($env:GITHUB_REF -match 'refs/pull/(\d+)')) { + Write-Host ">>> [XDP Diag] Not a PR (REF=$($env:GITHUB_REF))" + return + } + $PrNum = $Matches[1] + $Full = "### XDP Diag: $Title`n$Body" + $TmpFile = Join-Path $DiagDir "comment.json" + @{ body = $Full } | ConvertTo-Json -Depth 2 | Set-Content -Path $TmpFile + $result = bash -c "curl -sS -w '%{http_code}' -X POST -H 'Authorization: Bearer $($env:GITHUB_TOKEN)' -H 'Content-Type: application/json' -d @$TmpFile 'https://api.github.com/repos/$($env:GITHUB_REPOSITORY)/issues/$PrNum/comments' -o /dev/null 2>&1" + Write-Host ">>> [XDP Diag] Post '$Title' to PR #$PrNum -> HTTP $result" + } + + # Post pre-flight diagnostics BEFORE the test binary starts + $PreDiag = bash -c "echo 'mem:'; free -h | head -2; echo 'disk:'; df -h / | tail -1; echo 'load:'; cat /proc/loadavg; echo 'cores:'; nproc; echo 'kernel:'; uname -r" + Post-XdpDiag "Starting $BinaryName" "``````n$($PreDiag -join "`n")`n``````" + + # Start background resource monitor $MonitorScript = Join-Path $DiagDir "monitor.sh" @' #!/bin/bash @@ -401,30 +421,18 @@ done Write-Host ">>> [XDP Diag] Before ${BinaryName}:" bash -c "free -h; echo '---'; df -h / /tmp; echo '---'; cat /proc/loadavg" - # Disable core dumps entirely (the ulimit -c unlimited in - # run-gtest.ps1 overrides any soft limit set here, so we set - # the hard limit to 0). Use timeout as a safety net. + # Disable core dumps entirely via hard limit. Use timeout as + # a safety net. Invoke-Expression ('/usr/bin/sudo bash -c "ulimit -n $NOFILE && ulimit -Hc 0 && timeout --signal=KILL --foreground 6000 pwsh $RunTest -Path $TestPath $TestArguments"') $TestExitCode = $LASTEXITCODE + + # Post post-test diagnostics + $PostDiag = bash -c "echo 'mem:'; free -h | head -2; echo 'disk:'; df -h / | tail -1; echo 'load:'; cat /proc/loadavg; echo 'dmesg:'; sudo dmesg -T --since '2 hours ago' 2>/dev/null | grep -iE 'oom|killed process|xdp|bpf|out of memory|segfault' | tail -10 || echo 'none'" + $MonitorLog = if (Test-Path $DiagFile) { Get-Content $DiagFile -Raw } else { "no data" } + Post-XdpDiag "Finished $BinaryName (exit=$TestExitCode)" "``````n$($PostDiag -join "`n")`n``````n`nResource monitor:`n``````n$MonitorLog`n``````" + Write-Host ">>> [XDP Diag] After ${BinaryName} (exit=$TestExitCode):" - $DiagMsg = bash -c "echo 'mem:'; free -h | head -2; echo 'disk:'; df -h / | tail -1; echo 'load:'; cat /proc/loadavg; echo 'dmesg:'; sudo dmesg -T --since '2 hours ago' 2>/dev/null | grep -iE 'oom|killed process|xdp|bpf|out of memory|segfault' | tail -10 || true" - $DiagMsg | ForEach-Object { Write-Host $_ } - - # Post diagnostics as a PR comment so they survive a runner crash. - # GITHUB_TOKEN and GITHUB_REPOSITORY are set by GitHub Actions. - if ($env:GITHUB_TOKEN -and $env:GITHUB_REPOSITORY -and $env:GITHUB_REF -match 'refs/pull/(\d+)') { - $PrNumber = $Matches[1] - $CommentBody = "### XDP Diag checkpoint: ``$BinaryName`` (exit=$TestExitCode)`n``````n$($DiagMsg -join "`n")`n```````n_Resource monitor:_`n``````n$(if (Test-Path $DiagFile) { Get-Content $DiagFile -Raw } else { 'no data' })`n``````" - $JsonBody = @{ body = $CommentBody } | ConvertTo-Json -Compress - try { - Invoke-RestMethod -Uri "https://api.github.com/repos/$($env:GITHUB_REPOSITORY)/issues/$PrNumber/comments" ` - -Method Post -Headers @{ Authorization = "Bearer $($env:GITHUB_TOKEN)"; "Content-Type" = "application/json" } ` - -Body $JsonBody -ErrorAction SilentlyContinue | Out-Null - Write-Host ">>> [XDP Diag] Posted checkpoint to PR #$PrNumber" - } catch { - Write-Host ">>> [XDP Diag] Failed to post PR comment: $_" - } - } + $PostDiag | ForEach-Object { Write-Host $_ } # Stop the background monitor if ($MonitorPid) { From 1820d2b1796786e60a59e94fad9f9e5a130942f0 Mon Sep 17 00:00:00 2001 From: Guillaume Hetier Date: Fri, 3 Apr 2026 22:02:09 -0700 Subject: [PATCH 6/7] Add heartbeat monitor to capture system state every 5 min during test Posts PR comments every 5 minutes with memory, disk, load, dmesg (broadened to catch kernel oops/BUG/panic), top processes by memory, and resource monitor log. This will capture the system state just before the runner crash during msquictest in Debug mode. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- scripts/test.ps1 | 83 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 3 deletions(-) diff --git a/scripts/test.ps1 b/scripts/test.ps1 index ebbeb8d480..6f899aaeef 100644 --- a/scripts/test.ps1 +++ b/scripts/test.ps1 @@ -402,7 +402,7 @@ for ($iteration = 1; $iteration -le $NumIterations; $iteration++) { $PreDiag = bash -c "echo 'mem:'; free -h | head -2; echo 'disk:'; df -h / | tail -1; echo 'load:'; cat /proc/loadavg; echo 'cores:'; nproc; echo 'kernel:'; uname -r" Post-XdpDiag "Starting $BinaryName" "``````n$($PreDiag -join "`n")`n``````" - # Start background resource monitor + # Start background resource monitor that writes to a log file $MonitorScript = Join-Path $DiagDir "monitor.sh" @' #!/bin/bash @@ -419,6 +419,80 @@ done Write-Host "Warning: Could not start resource monitor: $_" } + # Start background heartbeat that posts PR comments every 5 minutes. + # This captures system state periodically so we can see what happens + # just before a runner crash (which destroys all step logs). + $HeartbeatScript = Join-Path $DiagDir "heartbeat.sh" + @" +#!/bin/bash +BINARY_NAME="$BinaryName" +DIAG_DIR="$DiagDir" +DIAG_FILE="$DiagFile" +"@ | Set-Content -Path $HeartbeatScript -NoNewline + @' + +COUNTER=0 +sleep 300 # first heartbeat after 5 minutes +while true; do + COUNTER=$((COUNTER + 1)) + # Collect system state + MEM=$(free -h | head -2) + DISK=$(df -h / | tail -1) + LOAD=$(cat /proc/loadavg) + # Broad dmesg check: kernel oops, BUG, OOM, XDP, segfault, panic, hung_task, slab + DMESG=$(sudo dmesg -T --since '6 minutes ago' 2>/dev/null | grep -iE 'oom|kill|xdp|bpf|segfault|oops|BUG|panic|Call Trace|RIP:|WARNING|hung_task|page allocation|slab|out of memory' | tail -20) + if [ -z "$DMESG" ]; then + DMESG="(no relevant kernel messages)" + fi + # Get last 10 lines of resource monitor + MONITOR_TAIL="" + if [ -f "$DIAG_FILE" ]; then + MONITOR_TAIL=$(tail -10 "$DIAG_FILE") + fi + # Get process tree for test processes + PROCS=$(ps aux --sort=-%mem | head -15) + # Build the comment body + BODY="### XDP Heartbeat #${COUNTER}: ${BINARY_NAME} (${COUNTER}x5 min elapsed) +\`\`\` +mem: +${MEM} +disk: +${DISK} +load: +${LOAD} +dmesg (last 6 min): +${DMESG} +top processes by memory: +${PROCS} +resource monitor (last 10 entries): +${MONITOR_TAIL} +\`\`\`" + # Post to PR if possible + if [ -n "$GITHUB_TOKEN" ] && [ -n "$GITHUB_REPOSITORY" ] && [ -n "$GITHUB_REF" ]; then + PR_NUM=$(echo "$GITHUB_REF" | grep -oP 'refs/pull/\K\d+') + if [ -n "$PR_NUM" ]; then + TMPFILE="${DIAG_DIR}/heartbeat_comment.json" + python3 -c "import json,sys; print(json.dumps({'body': sys.stdin.read()}))" <<< "$BODY" > "$TMPFILE" + curl -sS -w '%{http_code}' -X POST \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "Content-Type: application/json" \ + -d @"$TMPFILE" \ + "https://api.github.com/repos/$GITHUB_REPOSITORY/issues/$PR_NUM/comments" \ + -o /dev/null 2>&1 + fi + fi + sleep 300 # every 5 minutes +done +'@ | Add-Content -Path $HeartbeatScript -NoNewline + bash -c "chmod +x $HeartbeatScript" + $HeartbeatPid = $null + try { + $HeartbeatPid = (Start-Process -FilePath "bash" -ArgumentList $HeartbeatScript -PassThru -NoNewWindow).Id + Write-Host ">>> [XDP Diag] Heartbeat monitor started (PID=$HeartbeatPid)" + } catch { + Write-Host "Warning: Could not start heartbeat monitor: $_" + } + Write-Host ">>> [XDP Diag] Before ${BinaryName}:" bash -c "free -h; echo '---'; df -h / /tmp; echo '---'; cat /proc/loadavg" # Disable core dumps entirely via hard limit. Use timeout as @@ -427,17 +501,20 @@ done $TestExitCode = $LASTEXITCODE # Post post-test diagnostics - $PostDiag = bash -c "echo 'mem:'; free -h | head -2; echo 'disk:'; df -h / | tail -1; echo 'load:'; cat /proc/loadavg; echo 'dmesg:'; sudo dmesg -T --since '2 hours ago' 2>/dev/null | grep -iE 'oom|killed process|xdp|bpf|out of memory|segfault' | tail -10 || echo 'none'" + $PostDiag = bash -c "echo 'mem:'; free -h | head -2; echo 'disk:'; df -h / | tail -1; echo 'load:'; cat /proc/loadavg; echo 'dmesg:'; sudo dmesg -T --since '2 hours ago' 2>/dev/null | grep -iE 'oom|kill|xdp|bpf|segfault|oops|BUG|panic|Call Trace|RIP:|WARNING|hung_task|page allocation|slab|out of memory' | tail -20 || echo 'none'" $MonitorLog = if (Test-Path $DiagFile) { Get-Content $DiagFile -Raw } else { "no data" } Post-XdpDiag "Finished $BinaryName (exit=$TestExitCode)" "``````n$($PostDiag -join "`n")`n``````n`nResource monitor:`n``````n$MonitorLog`n``````" Write-Host ">>> [XDP Diag] After ${BinaryName} (exit=$TestExitCode):" $PostDiag | ForEach-Object { Write-Host $_ } - # Stop the background monitor + # Stop the background monitors if ($MonitorPid) { Stop-Process -Id $MonitorPid -ErrorAction SilentlyContinue } + if ($HeartbeatPid) { + Stop-Process -Id $HeartbeatPid -ErrorAction SilentlyContinue + } } else { Invoke-Expression ($RunTest + " -Path $TestPath " + $TestArguments) } From 21c05e466d88b341374436efed609a988ecd348d Mon Sep 17 00:00:00 2001 From: Guillaume Hetier Date: Fri, 3 Apr 2026 23:37:25 -0700 Subject: [PATCH 7/7] Increase heartbeat frequency to 60s, only for msquictest The Debug runner crashes within the first 5 minutes of msquictest. Previous 5-min heartbeat interval was too slow to capture any data before the crash. Now posting every 60 seconds, with no initial delay. Only runs for msquictest to avoid PR comment spam. Also added kernel memory details (slab, VmallocUsed, etc.) to catch kernel-level memory exhaustion not visible in userspace free. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- scripts/test.ps1 | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/scripts/test.ps1 b/scripts/test.ps1 index 6f899aaeef..62e4608d2a 100644 --- a/scripts/test.ps1 +++ b/scripts/test.ps1 @@ -419,9 +419,11 @@ done Write-Host "Warning: Could not start resource monitor: $_" } - # Start background heartbeat that posts PR comments every 5 minutes. - # This captures system state periodically so we can see what happens - # just before a runner crash (which destroys all step logs). + # Start background heartbeat only for msquictest (where the crash + # happens). Posts PR comments every 60 seconds so we capture the + # system state just before the runner crash. + $HeartbeatPid = $null + if ($BinaryName -eq "msquictest") { $HeartbeatScript = Join-Path $DiagDir "heartbeat.sh" @" #!/bin/bash @@ -432,7 +434,6 @@ DIAG_FILE="$DiagFile" @' COUNTER=0 -sleep 300 # first heartbeat after 5 minutes while true; do COUNTER=$((COUNTER + 1)) # Collect system state @@ -440,31 +441,35 @@ while true; do DISK=$(df -h / | tail -1) LOAD=$(cat /proc/loadavg) # Broad dmesg check: kernel oops, BUG, OOM, XDP, segfault, panic, hung_task, slab - DMESG=$(sudo dmesg -T --since '6 minutes ago' 2>/dev/null | grep -iE 'oom|kill|xdp|bpf|segfault|oops|BUG|panic|Call Trace|RIP:|WARNING|hung_task|page allocation|slab|out of memory' | tail -20) + DMESG=$(sudo dmesg -T --since '2 minutes ago' 2>/dev/null | grep -iE 'oom|kill|xdp|bpf|segfault|oops|BUG|panic|Call Trace|RIP:|WARNING|hung_task|page allocation|slab|out of memory' | tail -20) if [ -z "$DMESG" ]; then DMESG="(no relevant kernel messages)" fi - # Get last 10 lines of resource monitor + # Get last 5 lines of resource monitor MONITOR_TAIL="" if [ -f "$DIAG_FILE" ]; then - MONITOR_TAIL=$(tail -10 "$DIAG_FILE") + MONITOR_TAIL=$(tail -5 "$DIAG_FILE") fi - # Get process tree for test processes - PROCS=$(ps aux --sort=-%mem | head -15) + # Get process tree for test processes (top 10 by memory) + PROCS=$(ps aux --sort=-%mem | head -10) + # Check kernel memory (slab + page cache details) + KMEM=$(cat /proc/meminfo | grep -E 'Slab|SReclaimable|SUnreclaim|Committed_AS|VmallocUsed|AnonPages|Mapped|PageTables') # Build the comment body - BODY="### XDP Heartbeat #${COUNTER}: ${BINARY_NAME} (${COUNTER}x5 min elapsed) + BODY="### XDP Heartbeat #${COUNTER}: ${BINARY_NAME} (+${COUNTER} min) \`\`\` mem: ${MEM} +kernel mem: +${KMEM} disk: ${DISK} load: ${LOAD} -dmesg (last 6 min): +dmesg (last 2 min): ${DMESG} top processes by memory: ${PROCS} -resource monitor (last 10 entries): +resource monitor: ${MONITOR_TAIL} \`\`\`" # Post to PR if possible @@ -481,17 +486,17 @@ ${MONITOR_TAIL} -o /dev/null 2>&1 fi fi - sleep 300 # every 5 minutes + sleep 60 # heartbeat every 60 seconds done '@ | Add-Content -Path $HeartbeatScript -NoNewline bash -c "chmod +x $HeartbeatScript" - $HeartbeatPid = $null try { $HeartbeatPid = (Start-Process -FilePath "bash" -ArgumentList $HeartbeatScript -PassThru -NoNewWindow).Id - Write-Host ">>> [XDP Diag] Heartbeat monitor started (PID=$HeartbeatPid)" + Write-Host ">>> [XDP Diag] Heartbeat monitor started (PID=$HeartbeatPid) for $BinaryName" } catch { Write-Host "Warning: Could not start heartbeat monitor: $_" } + } # end if msquictest Write-Host ">>> [XDP Diag] Before ${BinaryName}:" bash -c "free -h; echo '---'; df -h / /tmp; echo '---'; cat /proc/loadavg"