diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0021146d22..b258de84c6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -48,6 +48,7 @@ concurrency:
 permissions:
   contents: read
   issues: write
+  pull-requests: write
 
 jobs:
   build-windows-kernel:
@@ -197,6 +198,25 @@ jobs:
         $ManifestPath = ".\src\manifest\MsQuicEtw.man"
         wevtutil.exe um $ManifestPath
         wevtutil.exe im $ManifestPath /rf:$($MsQuicDll) /mf:$($MsQuicDll)
+    - name: System Diagnostics (pre-test)
+      # NOTE(review): the "Fix log permissions for Linux XDP" step below says this same
+      # plat/xdp condition "doesn't work for some reason" - confirm this step actually runs.
+      if: matrix.vec.plat == 'linux' && matrix.vec.xdp == '-UseXdp'
+      continue-on-error: true # diagnostics are best-effort; never fail the job on them
+      run: |
+        echo "=== Memory ==="
+        free -h
+        echo "=== Disk ==="
+        df -h / /tmp
+        echo "=== Kernel / XDP ==="
+        uname -r
+        sudo dmesg -T 2>/dev/null | tail -30 || true
+        echo "=== Core pattern ==="
+        cat /proc/sys/kernel/core_pattern || true
+        echo "=== ulimits ==="
+        ulimit -a
+        echo "=== CPU ==="
+        nproc && cat /proc/loadavg
     - name: Test
       if: matrix.vec.os == 'WinServerPrerelease'
       shell: pwsh
@@ -206,6 +226,8 @@ jobs:
       if: matrix.vec.os != 'WinServerPrerelease'
       shell: pwsh
       timeout-minutes: 120
+      env:
+        GITHUB_TOKEN: ${{ github.token }}
       run: scripts/test.ps1 -Config ${{ matrix.vec.config }} -Arch ${{ matrix.vec.arch }} -Tls ${{ matrix.vec.tls }} -OsRunner ${{ matrix.vec.os }} -GHA -LogProfile ${{ matrix.vec.log || (inputs.log_level || 'Full.Light') }} -GenerateXmlResults ${{ matrix.vec.xdp }} ${{ matrix.vec.qtip }} ${{ inputs.filter && '-Filter' }} ${{ inputs.filter || '' }}
     - name: Fix log permissions for Linux XDP
       if: failure() && matrix.vec.plat == 'linux' # (matrix.vec.plat == 'linux' && matrix.vec.xdp == '-UseXdp') doesn't work for some reason
diff --git a/scripts/test.ps1 b/scripts/test.ps1
index f164c14639..62e4608d2a 100644
--- a/scripts/test.ps1
+++ b/scripts/test.ps1
@@ -375,7 +375,173 @@ for ($iteration = 1; $iteration -le $NumIterations; $iteration++) {
     foreach ($TestPath in $TestPaths) {
         if ($IsLinux -and $UseXdp) {
             $NOFILE = Invoke-Expression "bash -c 'ulimit -n'"
-            Invoke-Expression ('/usr/bin/sudo bash -c "ulimit -n $NOFILE && pwsh $RunTest -Path $TestPath $TestArguments"')
+            $DiagDir = Join-Path $RootDir "artifacts" "xdp_diagnostics"
+            New-Item -ItemType Directory -Path $DiagDir -Force | Out-Null
+            $DiagFile = Join-Path $DiagDir "resource_monitor.log"
+            $BinaryName = Split-Path $TestPath -Leaf
+
+            # Helper: post a diagnostic comment to the PR via the GitHub REST API.
+            #   $Title - text appended to the "### XDP Diag: " heading.
+            #   $Body  - markdown placed under the heading.
+            # Best-effort: skips (with a log line) when the token/repository are
+            # missing or GITHUB_REF is not a refs/pull/<n> ref, and never throws.
+            # The request is sent in-process so the token is never placed on a
+            # child process's command line.
+            function Post-XdpDiag($Title, $Body) {
+                if (-not $env:GITHUB_TOKEN -or -not $env:GITHUB_REPOSITORY) {
+                    Write-Host ">>> [XDP Diag] Missing GITHUB_TOKEN or GITHUB_REPOSITORY"
+                    return
+                }
+                if (-not ($env:GITHUB_REF -match 'refs/pull/(\d+)')) {
+                    Write-Host ">>> [XDP Diag] Not a PR (REF=$($env:GITHUB_REF))"
+                    return
+                }
+                $PrNum = $Matches[1]
+                $Payload = @{ body = "### XDP Diag: $Title`n$Body" } | ConvertTo-Json -Depth 2
+                $Uri = "https://api.github.com/repos/$($env:GITHUB_REPOSITORY)/issues/$PrNum/comments"
+                $Headers = @{ Authorization = "Bearer $($env:GITHUB_TOKEN)" }
+                try {
+                    Invoke-RestMethod -Method Post -Uri $Uri -Headers $Headers -ContentType 'application/json; charset=utf-8' -Body $Payload | Out-Null
+                    Write-Host ">>> [XDP Diag] Post '$Title' to PR #$PrNum -> OK"
+                } catch {
+                    Write-Host ">>> [XDP Diag] Post '$Title' to PR #$PrNum failed: $($_.Exception.Message)"
+                }
+            }
+
+            # Post pre-flight diagnostics BEFORE the test binary starts
+            $PreDiag = bash -c "echo 'mem:'; free -h | head -2; echo 'disk:'; df -h / | tail -1; echo 'load:'; cat /proc/loadavg; echo 'cores:'; nproc; echo 'kernel:'; uname -r"
+            Post-XdpDiag "Starting $BinaryName" "``````n$($PreDiag -join "`n")`n``````"
+
+            # Start background resource monitor that appends a mem/disk/load line to a log file every 30 seconds
+            $MonitorScript = Join-Path $DiagDir "monitor.sh"
+            @'
+#!/bin/bash
+while true; do
+    echo "[$(date +%H:%M:%S)] mem=$(free -m | awk 'NR==2{print $3"/"$2"MB"}') disk=$(df -h / | awk 'NR==2{print $3"/"$2}') load=$(cut -d' ' -f1-3 /proc/loadavg)" >> "$1"
+    sleep 30
+done
+'@ | Set-Content -Path $MonitorScript -NoNewline
+            bash -c "chmod +x $MonitorScript"
+            $MonitorPid = $null
+            try {
+                $MonitorPid = (Start-Process -FilePath "bash" -ArgumentList $MonitorScript, $DiagFile -PassThru -NoNewWindow).Id
+            } catch {
+                Write-Host "Warning: Could not start resource monitor: $_"
+            }
+
+            # Start background heartbeat only for msquictest (where the crash
+            # happens). Posts PR comments every 60 seconds so we capture the
+            # system state just before the runner crash.
+            $HeartbeatPid = $null
+            if ($BinaryName -eq "msquictest") {
+                $HeartbeatScript = Join-Path $DiagDir "heartbeat.sh"
+                # Expanding here-string: bakes the current PowerShell values into the script header.
+                @"
+#!/bin/bash
+BINARY_NAME="$BinaryName"
+DIAG_DIR="$DiagDir"
+DIAG_FILE="$DiagFile"
+"@ | Set-Content -Path $HeartbeatScript -NoNewline
+                # Literal here-string: the loop is appended verbatim so bash, not PowerShell, expands it.
+                @'
+
+COUNTER=0
+while true; do
+    COUNTER=$((COUNTER + 1))
+    # Collect system state
+    MEM=$(free -h | head -2)
+    DISK=$(df -h / | tail -1)
+    LOAD=$(cat /proc/loadavg)
+    # Broad dmesg check: kernel oops, BUG, OOM, XDP, segfault, panic, hung_task, slab
+    DMESG=$(sudo dmesg -T --since '2 minutes ago' 2>/dev/null | grep -iE 'oom|kill|xdp|bpf|segfault|oops|BUG|panic|Call Trace|RIP:|WARNING|hung_task|page allocation|slab|out of memory' | tail -20)
+    if [ -z "$DMESG" ]; then
+        DMESG="(no relevant kernel messages)"
+    fi
+    # Get last 5 lines of resource monitor
+    MONITOR_TAIL=""
+    if [ -f "$DIAG_FILE" ]; then
+        MONITOR_TAIL=$(tail -5 "$DIAG_FILE")
+    fi
+    # Top processes by memory, system-wide (ps header line + 9 rows)
+    PROCS=$(ps aux --sort=-%mem | head -10)
+    # Check kernel memory (slab + page cache details)
+    KMEM=$(cat /proc/meminfo | grep -E 'Slab|SReclaimable|SUnreclaim|Committed_AS|VmallocUsed|AnonPages|Mapped|PageTables')
+    # Build the comment body
+    BODY="### XDP Heartbeat #${COUNTER}: ${BINARY_NAME} (+${COUNTER} min)
+\`\`\`
+mem:
+${MEM}
+kernel mem:
+${KMEM}
+disk:
+${DISK}
+load:
+${LOAD}
+dmesg (last 2 min):
+${DMESG}
+top processes by memory:
+${PROCS}
+resource monitor:
+${MONITOR_TAIL}
+\`\`\`"
+    # Post to PR if possible
+    if [ -n "$GITHUB_TOKEN" ] && [ -n "$GITHUB_REPOSITORY" ] && [ -n "$GITHUB_REF" ]; then
+        PR_NUM=$(echo "$GITHUB_REF" | grep -oP 'refs/pull/\K\d+')
+        if [ -n "$PR_NUM" ]; then
+            TMPFILE="${DIAG_DIR}/heartbeat_comment.json"
+            python3 -c "import json,sys; print(json.dumps({'body': sys.stdin.read()}))" <<< "$BODY" > "$TMPFILE"
+            # The auth header is fed on stdin (-H @-) rather than as an argument:
+            # this script publishes `ps aux` output, so the token must stay out of argv.
+            curl -sS -w '%{http_code}' -X POST \
+                -H @- \
+                -H "Content-Type: application/json" \
+                -d @"$TMPFILE" \
+                "https://api.github.com/repos/$GITHUB_REPOSITORY/issues/$PR_NUM/comments" \
+                -o /dev/null 2>&1 <<< "Authorization: Bearer $GITHUB_TOKEN"
+        fi
+    fi
+    sleep 60  # heartbeat every 60 seconds
+done
+'@ | Add-Content -Path $HeartbeatScript -NoNewline
+                bash -c "chmod +x $HeartbeatScript"
+                try {
+                    $HeartbeatPid = (Start-Process -FilePath "bash" -ArgumentList $HeartbeatScript -PassThru -NoNewWindow).Id
+                    Write-Host ">>> [XDP Diag] Heartbeat monitor started (PID=$HeartbeatPid) for $BinaryName"
+                } catch {
+                    Write-Host "Warning: Could not start heartbeat monitor: $_"
+                }
+            } # end if msquictest
+
+            Write-Host ">>> [XDP Diag] Before ${BinaryName}:"
+            bash -c "free -h; echo '---'; df -h / /tmp; echo '---'; cat /proc/loadavg"
+            try {
+                # Disable core dumps entirely via hard limit. Use timeout as
+                # a safety net.
+                Invoke-Expression ('/usr/bin/sudo bash -c "ulimit -n $NOFILE && ulimit -Hc 0 && timeout --signal=KILL --foreground 6000 pwsh $RunTest -Path $TestPath $TestArguments"')
+                $TestExitCode = $LASTEXITCODE
+
+                # Post post-test diagnostics. "grep ." makes the pipeline fail when no
+                # kernel line matched, so the 'none' fallback is actually printed.
+                $PostDiag = bash -c "echo 'mem:'; free -h | head -2; echo 'disk:'; df -h / | tail -1; echo 'load:'; cat /proc/loadavg; echo 'dmesg:'; sudo dmesg -T --since '2 hours ago' 2>/dev/null | grep -iE 'oom|kill|xdp|bpf|segfault|oops|BUG|panic|Call Trace|RIP:|WARNING|hung_task|page allocation|slab|out of memory' | tail -20 | grep . || echo 'none'"
+                $MonitorLog = if (Test-Path $DiagFile) { Get-Content $DiagFile -Raw } else { "no data" }
+                Post-XdpDiag "Finished $BinaryName (exit=$TestExitCode)" "``````n$($PostDiag -join "`n")`n``````n`nResource monitor:`n``````n$MonitorLog`n``````"
+
+                Write-Host ">>> [XDP Diag] After ${BinaryName} (exit=$TestExitCode):"
+                $PostDiag | ForEach-Object { Write-Host $_ }
+            } finally {
+                # Stop the background monitors, even when the test run or the
+                # diagnostics above threw, so they are not left running.
+                if ($MonitorPid) {
+                    Stop-Process -Id $MonitorPid -ErrorAction SilentlyContinue
+                }
+                if ($HeartbeatPid) {
+                    Stop-Process -Id $HeartbeatPid -ErrorAction SilentlyContinue
+                }
+            }
+
+            # The diagnostic bash calls above overwrite $LASTEXITCODE; restore the test
+            # run's exit code so anything that inspects it afterwards sees the test result.
+            $global:LASTEXITCODE = $TestExitCode
         } else {
             Invoke-Expression ($RunTest + " -Path $TestPath " + $TestArguments)
         }