Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions build_tools/github_actions/amdgpu_family_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,11 +270,9 @@
},
"gfx1150": {
"linux": {
# TODO(#3199): Re-enable machine once it is stable
# Label is "linux-gfx1150-gpu-rocm"
"test-runs-on": "",
"test-runs-on": "linux-gfx1150-gpu-rocm",
"family": "gfx1150",
"fetch-gfx-targets": [],
"fetch-gfx-targets": ["gfx1150"],
"build_variants": ["release"],
"sanity_check_only_for_family": True,
},
Expand Down
4 changes: 2 additions & 2 deletions build_tools/github_actions/fetch_test_configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def _get_script_path(script_name: str) -> str:
"sanity": {
"job_name": "sanity",
"fetch_artifact_args": "--base-only",
"timeout_minutes": 5,
"timeout_minutes": 15,
"test_script": f"python {_get_script_path('test_sanity.py')}",
"platform": ["linux", "windows"],
"total_shards_dict": {
Expand All @@ -58,7 +58,7 @@ def _get_script_path(script_name: str) -> str:
},
# Running docker with cap-add and -v /lib/modules, by recommendation of GitHub:
# https://rocm.docs.amd.com/projects/amdsmi/en/amd-staging/how-to/setup-docker-container.html
"container_options": "--cap-add SYS_MODULE -v /lib/modules:/lib/modules",
"container_options": "--cap-add SYS_MODULE --cap-add SYS_PTRACE -v /lib/modules:/lib/modules",
},
# hip-tests
"hip-tests": {
Expand Down
5 changes: 2 additions & 3 deletions build_tools/github_actions/new_amdgpu_family_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,12 +187,11 @@
"build_variants": ["release"],
},
"test": {
# TODO(#3199): Re-enable machine once it is stable
"run_tests": False,
"run_tests": True,
"runs_on": {
"test": "linux-gfx1150-gpu-rocm",
},
"fetch-gfx-targets": [],
"fetch-gfx-targets": ["gfx1150"],
"sanity_check_only_for_family": True,
},
"release": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@
# Enable verbose ROCm logging, see
# https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html
# Note: ROCM_KPACK_DEBUG is set for all components by test_component.yml.
env["AMD_LOG_LEVEL"] = "4"
env["AMD_LOG_LEVEL"] = "7"
# Extra HIP/HSA diagnostics for debugging gfx1150 hang (TheRock#3199)
env["HIP_TRACE_API"] = "1"
env["HIP_LAUNCH_BLOCKING"] = "1"
env["AMD_SERIALIZE_KERNEL"] = "3"
env["AMD_SERIALIZE_COPY"] = "3"
env["HSA_ENABLE_SDMA"] = "0"

# The sanity checks run tools like 'offload-arch' which may search for DLLs on
# multiple search paths (PATH, CWD, system32, etc.).
Expand Down
8 changes: 8 additions & 0 deletions build_tools/print_driver_gpu_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,14 @@ def run_sanity(os_name: str) -> None:
args=["-r"],
extra_command_search_paths=[bin_dir],
)
# Print per-component firmware versions (useful for debugging hangs)
if AMDGPU_FAMILIES not in unsupported_amdsmi_families:
run_command_with_search(
label="amd-smi firmware",
command="amd-smi",
args=["firmware"],
extra_command_search_paths=[bin_dir],
)

log("\n=== End of sanity check ===")

Expand Down
95 changes: 95 additions & 0 deletions tests/hip_simple_check.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// Copyright Advanced Micro Devices, Inc.
// SPDX-License-Identifier: MIT

// Minimal HIP kernel WITHOUT printf, to isolate whether the gfx1150
// sanity-test hang (TheRock#3199) is caused by printf buffer handling
// or by kernel dispatch itself.
//
// Includes a SIGALRM watchdog that dumps a backtrace + /proc/self/maps
// if the process hangs for 20 seconds.

#include <csignal>
#include <cstdio>
#include <cstdlib>
#include <execinfo.h>
#include <hip/hip_runtime.h>
#include <unistd.h>

static void dump_backtrace_and_maps(int sig) {
fprintf(stderr, "\n=== WATCHDOG: process hung for 20s (signal %d) ===\n",
sig);

// Dump backtrace addresses
void *frames[64];
int n = backtrace(frames, 64);
fprintf(stderr, "=== backtrace (%d frames) ===\n", n);
backtrace_symbols_fd(frames, n, STDERR_FILENO);

// Dump /proc/self/maps so addresses can be resolved offline
fprintf(stderr, "\n=== /proc/self/maps ===\n");
FILE *maps = fopen("/proc/self/maps", "r");
if (maps) {
char buf[512];
while (fgets(buf, sizeof(buf), maps))
fputs(buf, stderr);
fclose(maps);
}

// Dump /proc/self/stack (kernel stack, if readable)
fprintf(stderr, "\n=== /proc/self/stack ===\n");
FILE *stack = fopen("/proc/self/stack", "r");
if (stack) {
char buf[512];
while (fgets(buf, sizeof(buf), stack))
fputs(buf, stderr);
fclose(stack);
} else {
fprintf(stderr, "(not readable)\n");
}

_exit(2);
}

__global__ void squares_no_printf(int *buf) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
buf[i] = i * i;
}

int main() {
// Set up watchdog: SIGALRM after 20s
signal(SIGALRM, dump_backtrace_and_maps);
alarm(20);

constexpr int gridsize = 1;
constexpr int blocksize = 64;
constexpr int size = gridsize * blocksize;
int *d_buf;
fprintf(stderr, "hip_simple_check: hipHostMalloc\n");
hipHostMalloc(&d_buf, size * sizeof(int));
fprintf(stderr, "hip_simple_check: hipLaunchKernelGGL\n");
hipLaunchKernelGGL(squares_no_printf, gridsize, blocksize, 0, 0, d_buf);
fprintf(stderr, "hip_simple_check: hipDeviceSynchronize\n");
hipDeviceSynchronize();

// Cancel watchdog — we didn't hang
alarm(0);
fprintf(stderr, "hip_simple_check: checking results\n");

int mismatches_count = 0;
for (int i = 0; i < size; ++i) {
int square = i * i;
if (d_buf[i] != square) {
fprintf(stderr,
"Element at index %d expected value %d, actual value: %d\n", i,
square, d_buf[i]);
++mismatches_count;
}
}
if (mismatches_count > 0) {
fprintf(stderr, "There were %d mismatches\n", mismatches_count);
return 1;
}

fprintf(stderr, "hip_simple_check: PASSED\n");
return 0;
}
141 changes: 130 additions & 11 deletions tests/test_rocm_sanity.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import shlex
import subprocess
import sys
import time

THIS_DIR = Path(__file__).resolve().parent

Expand All @@ -29,19 +30,23 @@ def is_windows():
return "windows" == platform.system().lower()


def run_command(command: list[str], cwd=None):
def run_command(command: list[str], cwd=None, capture=True):
logger.info(f"++ Run [{cwd}]$ {shlex.join(command)}")
process = subprocess.run(
command, capture_output=True, cwd=cwd, shell=is_windows(), text=True
command,
capture_output=capture,
cwd=cwd,
shell=is_windows(),
text=True,
)
if process.returncode != 0:
logger.error(f"Command failed!")
logger.error("command stdout:")
for line in process.stdout.splitlines():
logger.error(line)
logger.error("command stderr:")
for line in process.stderr.splitlines():
logger.error(line)
if capture:
logger.error("command stdout:")
for line in process.stdout.splitlines():
logger.error(line)
logger.error("command stderr:")
for line in process.stderr.splitlines():
logger.error(line)
raise Exception(f"Command failed: `{shlex.join(command)}`, see output above")
return process

Expand Down Expand Up @@ -82,6 +87,116 @@ def test_rocm_output(self, rocm_info_output, to_search):
f"Failed to search for {to_search} in rocminfo output",
)

# TODO(#3313): Re-enable once hipcc test is fixed for ASAN builds
@pytest.mark.skipif(
is_asan(), reason="hipcc test fails with ASAN build, see TheRock#3313"
)
def test_hip_simple(self):
"""Minimal kernel without printf to isolate gfx1150 hang (TheRock#3199)."""
platform_executable_suffix = ".exe" if is_windows() else ""
offload_arch_path = (
THEROCK_BIN_DIR
/ ".."
/ "lib"
/ "llvm"
/ "bin"
/ f"offload-arch{platform_executable_suffix}"
).resolve()
process = run_command([str(offload_arch_path)])
offload_arch = None
for line in process.stdout.splitlines():
if "gfx" in line:
offload_arch = line
break
assert (
offload_arch is not None
), f"Expected offload-arch to return gfx####, got:\n{process.stdout}"

executable = f"hip_simple_check{platform_executable_suffix}"
run_command(
[
f"{THEROCK_BIN_DIR}/hipcc",
str(THIS_DIR / "hip_simple_check.cpp"),
"-Xlinker",
f"-rpath={THEROCK_BIN_DIR}/../lib/",
f"--offload-arch={offload_arch}",
"-o",
executable,
],
cwd=str(THEROCK_BIN_DIR),
)

# Dump kernel log to capture GPU firmware errors that may cause
# all subsequent kernel launches to stall. Try /dev/kmsg first
# (bypasses dmesg_restrict), then fall back to dmesg.
def dump_kmsg(label, last_n=50):
# Try /dev/kmsg (non-blocking read of kernel ring buffer)
try:
with open("/dev/kmsg", "r", errors="replace") as f:
import fcntl

# Set non-blocking so we don't hang
fd = f.fileno()
flags = fcntl.fcntl(fd, fcntl.F_GETFL)
fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)
lines = []
try:
while True:
line = f.readline()
if not line:
break
lines.append(line.rstrip())
except (BlockingIOError, OSError):
pass
if lines:
logger.info(f"=== {label} (last {last_n} from /dev/kmsg) ===")
for line in lines[-last_n:]:
logger.info(line)
return
except (PermissionError, OSError):
pass

# Fallback: dmesg command
try:
result = subprocess.run(
["dmesg", "-T"],
capture_output=True,
text=True,
timeout=10,
)
if result.stdout.strip():
logger.info(f"=== {label} (last {last_n} from dmesg) ===")
for line in result.stdout.splitlines()[-last_n:]:
logger.info(line)
else:
logger.warning(f"{label}: dmesg returned empty output")
except Exception as e:
logger.warning(f"{label}: dmesg failed: {e}")

dump_kmsg("kernel log before hip_simple_check", last_n=50)

# The C++ binary has a built-in SIGALRM watchdog at 20s that
# dumps backtrace + /proc/self/maps + /proc/self/stack to stderr
# and exits with code 2. No need for external gdb/strace.
platform_executable_prefix = "./" if not is_windows() else ""
exe = f"{platform_executable_prefix}{executable}"
logger.info(f"++ Run [{THEROCK_BIN_DIR}]$ {exe}")
proc = subprocess.Popen(
[exe],
cwd=str(THEROCK_BIN_DIR),
)

hang_timeout = 30
try:
proc.wait(timeout=hang_timeout)
except subprocess.TimeoutExpired:
logger.error(f"hip_simple_check hung for {hang_timeout}s, killing")
dump_kmsg("kernel log after hang", last_n=100)
proc.kill()
proc.wait()

check.equal(proc.returncode, 0)

# TODO(#3313): Re-enable once hipcc test is fixed for ASAN builds
@pytest.mark.skipif(
is_asan(), reason="hipcc test fails with ASAN build, see TheRock#3313"
Expand Down Expand Up @@ -139,10 +254,14 @@ def test_hip_printf(self):
cwd=str(THEROCK_BIN_DIR),
)

# Running and checking the executable
# Running and checking the executable.
# capture=False so HIP debug output streams directly to the log
# if the process hangs (see TheRock#3199).
platform_executable_prefix = "./" if not is_windows() else ""
hipcc_check_executable = f"{platform_executable_prefix}hipcc_check"
process = run_command([hipcc_check_executable], cwd=str(THEROCK_BIN_DIR))
process = run_command(
[hipcc_check_executable], cwd=str(THEROCK_BIN_DIR), capture=False
)
check.equal(process.returncode, 0)
check.greater(
os.path.getsize(str(THEROCK_BIN_DIR / hipcc_check_executable_file)), 0
Expand Down
Loading