diff --git a/build_tools/github_actions/amdgpu_family_matrix.py b/build_tools/github_actions/amdgpu_family_matrix.py index c8f00f8e3f..09f64f49b9 100644 --- a/build_tools/github_actions/amdgpu_family_matrix.py +++ b/build_tools/github_actions/amdgpu_family_matrix.py @@ -270,11 +270,9 @@ }, "gfx1150": { "linux": { - # TODO(#3199): Re-enable machine once it is stable - # Label is "linux-gfx1150-gpu-rocm" - "test-runs-on": "", + "test-runs-on": "linux-gfx1150-gpu-rocm", "family": "gfx1150", - "fetch-gfx-targets": [], + "fetch-gfx-targets": ["gfx1150"], "build_variants": ["release"], "sanity_check_only_for_family": True, }, diff --git a/build_tools/github_actions/fetch_test_configurations.py b/build_tools/github_actions/fetch_test_configurations.py index edeee3e639..509946d7f5 100644 --- a/build_tools/github_actions/fetch_test_configurations.py +++ b/build_tools/github_actions/fetch_test_configurations.py @@ -49,7 +49,7 @@ def _get_script_path(script_name: str) -> str: "sanity": { "job_name": "sanity", "fetch_artifact_args": "--base-only", - "timeout_minutes": 5, + "timeout_minutes": 15, "test_script": f"python {_get_script_path('test_sanity.py')}", "platform": ["linux", "windows"], "total_shards_dict": { @@ -58,7 +58,7 @@ def _get_script_path(script_name: str) -> str: }, # Running docker with cap-add and -v /lib/modules, by recommendation of GitHub: # https://rocm.docs.amd.com/projects/amdsmi/en/amd-staging/how-to/setup-docker-container.html - "container_options": "--cap-add SYS_MODULE -v /lib/modules:/lib/modules", + "container_options": "--cap-add SYS_MODULE --cap-add SYS_PTRACE -v /lib/modules:/lib/modules", }, # hip-tests "hip-tests": { diff --git a/build_tools/github_actions/new_amdgpu_family_matrix.py b/build_tools/github_actions/new_amdgpu_family_matrix.py index a3e286b59c..b63bafcf7d 100644 --- a/build_tools/github_actions/new_amdgpu_family_matrix.py +++ b/build_tools/github_actions/new_amdgpu_family_matrix.py @@ -187,12 +187,11 @@ "build_variants": ["release"], }, "test": { - # TODO(#3199): Re-enable machine once it is stable - "run_tests": False, + "run_tests": True, "runs_on": { "test": "linux-gfx1150-gpu-rocm", }, - "fetch-gfx-targets": [], + "fetch-gfx-targets": ["gfx1150"], "sanity_check_only_for_family": True, }, "release": { diff --git a/build_tools/github_actions/test_executable_scripts/test_sanity.py b/build_tools/github_actions/test_executable_scripts/test_sanity.py index e54a0e2060..4a30e729f4 100644 --- a/build_tools/github_actions/test_executable_scripts/test_sanity.py +++ b/build_tools/github_actions/test_executable_scripts/test_sanity.py @@ -16,7 +16,13 @@ # Enable verbose ROCm logging, see # https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html # Note: ROCM_KPACK_DEBUG is set for all components by test_component.yml. -env["AMD_LOG_LEVEL"] = "4" +env["AMD_LOG_LEVEL"] = "7" +# Extra HIP/HSA diagnostics for debugging gfx1150 hang (TheRock#3199) +env["HIP_TRACE_API"] = "1" +env["HIP_LAUNCH_BLOCKING"] = "1" +env["AMD_SERIALIZE_KERNEL"] = "3" +env["AMD_SERIALIZE_COPY"] = "3" +env["HSA_ENABLE_SDMA"] = "0" # The sanity checks run tools like 'offload-arch' which may search for DLLs on # multiple search paths (PATH, CWD, system32, etc.). diff --git a/build_tools/print_driver_gpu_info.py b/build_tools/print_driver_gpu_info.py index 5d90007f7d..ebeffe598c 100644 --- a/build_tools/print_driver_gpu_info.py +++ b/build_tools/print_driver_gpu_info.py @@ -130,6 +130,14 @@ def run_sanity(os_name: str) -> None: args=["-r"], extra_command_search_paths=[bin_dir], ) + # Print per-component firmware versions (useful for debugging hangs) + if AMDGPU_FAMILIES not in unsupported_amdsmi_families: + run_command_with_search( + label="amd-smi firmware", + command="amd-smi", + args=["firmware"], + extra_command_search_paths=[bin_dir], + ) log("\n=== End of sanity check ===") diff --git a/tests/hip_simple_check.cpp b/tests/hip_simple_check.cpp new file mode 100644 index 0000000000..fe661c6981 --- /dev/null +++ b/tests/hip_simple_check.cpp @@ -0,0 +1,95 @@ +// Copyright Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +// Minimal HIP kernel WITHOUT printf, to isolate whether the gfx1150 +// sanity-test hang (TheRock#3199) is caused by printf buffer handling +// or by kernel dispatch itself. +// +// Includes a SIGALRM watchdog that dumps a backtrace + /proc/self/maps +// if the process hangs for 20 seconds. + +#include +#include +#include +#include +#include +#include + +static void dump_backtrace_and_maps(int sig) { + fprintf(stderr, "\n=== WATCHDOG: process hung for 20s (signal %d) ===\n", + sig); + + // Dump backtrace addresses + void *frames[64]; + int n = backtrace(frames, 64); + fprintf(stderr, "=== backtrace (%d frames) ===\n", n); + backtrace_symbols_fd(frames, n, STDERR_FILENO); + + // Dump /proc/self/maps so addresses can be resolved offline + fprintf(stderr, "\n=== /proc/self/maps ===\n"); + FILE *maps = fopen("/proc/self/maps", "r"); + if (maps) { + char buf[512]; + while (fgets(buf, sizeof(buf), maps)) + fputs(buf, stderr); + fclose(maps); + } + + // Dump /proc/self/stack (kernel stack, if readable) + fprintf(stderr, "\n=== /proc/self/stack ===\n"); + FILE *stack = fopen("/proc/self/stack", "r"); + if (stack) { + char buf[512]; + while (fgets(buf, sizeof(buf), stack)) + fputs(buf, stderr); + fclose(stack); + } else { + fprintf(stderr, "(not readable)\n"); + } + + _exit(2); +} + +__global__ void squares_no_printf(int *buf) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + buf[i] = i * i; +} + +int main() { + // Set up watchdog: SIGALRM after 20s + signal(SIGALRM, dump_backtrace_and_maps); + alarm(20); + + constexpr int gridsize = 1; + constexpr int blocksize = 64; + constexpr int size = gridsize * blocksize; + int *d_buf; + fprintf(stderr, "hip_simple_check: hipHostMalloc\n"); + hipHostMalloc(&d_buf, size * sizeof(int)); + fprintf(stderr, "hip_simple_check: hipLaunchKernelGGL\n"); + hipLaunchKernelGGL(squares_no_printf, gridsize, blocksize, 0, 0, d_buf); + fprintf(stderr, "hip_simple_check: hipDeviceSynchronize\n"); + hipDeviceSynchronize(); + + // Cancel watchdog — we didn't hang + alarm(0); + fprintf(stderr, "hip_simple_check: checking results\n"); + + int mismatches_count = 0; + for (int i = 0; i < size; ++i) { + int square = i * i; + if (d_buf[i] != square) { + fprintf(stderr, + "Element at index %d expected value %d, actual value: %d\n", i, + square, d_buf[i]); + ++mismatches_count; + } + } + if (mismatches_count > 0) { + fprintf(stderr, "There were %d mismatches\n", mismatches_count); + return 1; + } + + fprintf(stderr, "hip_simple_check: PASSED\n"); + return 0; +} diff --git a/tests/test_rocm_sanity.py b/tests/test_rocm_sanity.py index bcdfc35baf..8759f2415b 100644 --- a/tests/test_rocm_sanity.py +++ b/tests/test_rocm_sanity.py @@ -11,6 +11,7 @@ import shlex import subprocess import sys +import time THIS_DIR = Path(__file__).resolve().parent @@ -29,19 +30,23 @@ def is_windows(): return "windows" == platform.system().lower() -def run_command(command: list[str], cwd=None): +def run_command(command: list[str], cwd=None, capture=True): logger.info(f"++ Run [{cwd}]$ {shlex.join(command)}") process = subprocess.run( - command, capture_output=True, cwd=cwd, shell=is_windows(), text=True + command, + capture_output=capture, + cwd=cwd, + shell=is_windows(), + text=True, ) if process.returncode != 0: - logger.error(f"Command failed!") - logger.error("command stdout:") - for line in process.stdout.splitlines(): - logger.error(line) - logger.error("command stderr:") - for line in process.stderr.splitlines(): - logger.error(line) + if capture: + logger.error("command stdout:") + for line in process.stdout.splitlines(): + logger.error(line) + logger.error("command stderr:") + for line in process.stderr.splitlines(): + logger.error(line) raise Exception(f"Command failed: `{shlex.join(command)}`, see output above") return process @@ -82,6 +87,116 @@ def test_rocm_output(self, rocm_info_output, to_search): f"Failed to search for {to_search} in rocminfo output", ) + # TODO(#3313): Re-enable once hipcc test is fixed for ASAN builds + @pytest.mark.skipif( + is_asan(), reason="hipcc test fails with ASAN build, see TheRock#3313" + ) + def test_hip_simple(self): + """Minimal kernel without printf to isolate gfx1150 hang (TheRock#3199).""" + platform_executable_suffix = ".exe" if is_windows() else "" + offload_arch_path = ( + THEROCK_BIN_DIR + / ".." + / "lib" + / "llvm" + / "bin" + / f"offload-arch{platform_executable_suffix}" + ).resolve() + process = run_command([str(offload_arch_path)]) + offload_arch = None + for line in process.stdout.splitlines(): + if "gfx" in line: + offload_arch = line + break + assert ( + offload_arch is not None + ), f"Expected offload-arch to return gfx####, got:\n{process.stdout}" + + executable = f"hip_simple_check{platform_executable_suffix}" + run_command( + [ + f"{THEROCK_BIN_DIR}/hipcc", + str(THIS_DIR / "hip_simple_check.cpp"), + "-Xlinker", + f"-rpath={THEROCK_BIN_DIR}/../lib/", + f"--offload-arch={offload_arch}", + "-o", + executable, + ], + cwd=str(THEROCK_BIN_DIR), + ) + + # Dump kernel log to capture GPU firmware errors that may cause + # all subsequent kernel launches to stall. Try /dev/kmsg first + # (bypasses dmesg_restrict), then fall back to dmesg. + def dump_kmsg(label, last_n=50): + # Try /dev/kmsg (non-blocking read of kernel ring buffer) + try: + with open("/dev/kmsg", "r", errors="replace") as f: + import fcntl + + # Set non-blocking so we don't hang + fd = f.fileno() + flags = fcntl.fcntl(fd, fcntl.F_GETFL) + fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK) + lines = [] + try: + while True: + line = f.readline() + if not line: + break + lines.append(line.rstrip()) + except (BlockingIOError, OSError): + pass + if lines: + logger.info(f"=== {label} (last {last_n} from /dev/kmsg) ===") + for line in lines[-last_n:]: + logger.info(line) + return + except (PermissionError, OSError): + pass + + # Fallback: dmesg command + try: + result = subprocess.run( + ["dmesg", "-T"], + capture_output=True, + text=True, + timeout=10, + ) + if result.stdout.strip(): + logger.info(f"=== {label} (last {last_n} from dmesg) ===") + for line in result.stdout.splitlines()[-last_n:]: + logger.info(line) + else: + logger.warning(f"{label}: dmesg returned empty output") + except Exception as e: + logger.warning(f"{label}: dmesg failed: {e}") + + dump_kmsg("kernel log before hip_simple_check", last_n=50) + + # The C++ binary has a built-in SIGALRM watchdog at 20s that + # dumps backtrace + /proc/self/maps + /proc/self/stack to stderr + # and exits with code 2. No need for external gdb/strace. + platform_executable_prefix = "./" if not is_windows() else "" + exe = f"{platform_executable_prefix}{executable}" + logger.info(f"++ Run [{THEROCK_BIN_DIR}]$ {exe}") + proc = subprocess.Popen( + [exe], + cwd=str(THEROCK_BIN_DIR), + ) + + hang_timeout = 30 + try: + proc.wait(timeout=hang_timeout) + except subprocess.TimeoutExpired: + logger.error(f"hip_simple_check hung for {hang_timeout}s, killing") + dump_kmsg("kernel log after hang", last_n=100) + proc.kill() + proc.wait() + + check.equal(proc.returncode, 0) + # TODO(#3313): Re-enable once hipcc test is fixed for ASAN builds @pytest.mark.skipif( is_asan(), reason="hipcc test fails with ASAN build, see TheRock#3313" @@ -139,10 +254,14 @@ def test_hip_printf(self): cwd=str(THEROCK_BIN_DIR), ) - # Running and checking the executable + # Running and checking the executable. + # capture=False so HIP debug output streams directly to the log + # if the process hangs (see TheRock#3199). platform_executable_prefix = "./" if not is_windows() else "" hipcc_check_executable = f"{platform_executable_prefix}hipcc_check" - process = run_command([hipcc_check_executable], cwd=str(THEROCK_BIN_DIR)) + process = run_command( + [hipcc_check_executable], cwd=str(THEROCK_BIN_DIR), capture=False + ) check.equal(process.returncode, 0) check.greater( os.path.getsize(str(THEROCK_BIN_DIR / hipcc_check_executable_file)), 0