-
Notifications
You must be signed in to change notification settings - Fork 228
Expand file tree
/
Copy pathprint_driver_gpu_info.py
More file actions
152 lines (126 loc) · 3.97 KB
/
print_driver_gpu_info.py
File metadata and controls
152 lines (126 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python3
# Copyright Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
"""
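Sanity check script for CI runners.

On Linux:
  - run "amd-smi static" (skipped for GPU families listed as unsupported)
  - run "rocminfo"
  - run "uname -r"
  - run "amd-smi firmware" (skipped for the same GPU families)
On Windows:
  - run "hipInfo.exe"

This script prints only raw command output (plus section headers); nothing is
parsed or validated.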
"""
import os
from pathlib import Path
import platform
import shlex
import shutil
import subprocess
import sys
from typing import List, Optional
# Value of the AMDGPU_FAMILIES environment variable, or None when it is unset.
AMDGPU_FAMILIES = os.environ.get("AMDGPU_FAMILIES")

# GPU families for which the amd-smi invocations in this script are skipped.
# TODO(#2964): Remove gfx950-dcgpu once amdsmi static does not timeout
unsupported_amdsmi_families = [
    "gfx1151",
    "gfx950-dcgpu",
]
def log(*values, **print_options):
    """Print like the built-in ``print`` and then flush ``sys.stdout``.

    All positional and keyword arguments are forwarded to ``print``
    unchanged. ``sys.stdout`` is flushed after every call, regardless of
    which ``file`` the message was written to.
    """
    print(*values, **print_options)
    sys.stdout.flush()
def run_command(args: List[str | Path], cwd: Optional[Path] = None) -> None:
    """Run a command and log its combined stdout/stderr output.

    The command line is echoed first (prefixed with ``++ Exec``), then the
    captured output is logged with trailing whitespace stripped. The exit
    status is not checked and nothing is returned. Failures to launch the
    command are logged instead of raised, so later commands can still run.

    Args:
        args: Executable followed by its arguments; every item is converted
            with ``str()`` before use.
        cwd: Working directory for the command. Defaults to the current
            working directory.
    """
    args = [str(arg) for arg in args]
    if cwd is None:
        cwd = Path.cwd()
    log(f"++ Exec [{cwd}]$ {shlex.join(args)}")
    try:
        proc = subprocess.run(
            args,
            cwd=str(cwd),
            stdout=subprocess.PIPE,
            # Fold stderr into stdout so a single stream is logged.
            stderr=subprocess.STDOUT,
            text=True,
            # Substitute undecodable bytes instead of raising
            # UnicodeDecodeError on unexpected tool output.
            errors="replace",
            check=False,
            # Give the child no stdin so it cannot wait for input.
            stdin=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        log(f"{args[0]}: command not found")
    except OSError as exc:
        # E.g. PermissionError when the path exists but is not executable.
        log(f"{args[0]}: failed to execute: {exc}")
    else:
        log(proc.stdout.rstrip())
def run_command_with_search(
    label: str,
    command: str,
    args: List[str],
    extra_command_search_paths: List[Path],
) -> None:
    """Run a command, searching in extra paths first, then PATH.

    A ``=== label ===`` header is always logged. If the command is found it
    is executed via ``run_command``; otherwise a "command not found" line is
    logged.

    Args:
        label: Text shown in the section header.
        command: File name of the executable to look for.
        args: Arguments appended after the resolved executable.
        extra_command_search_paths: Directories checked, in order, before
            falling back to ``PATH``.

    Example:
        run_command_with_search(
            label="amd-smi static",
            command="amd-smi",
            args=["static"],
            extra_command_search_paths=[bin_dir],
        )
    """
    log(f"\n=== {label} ===")

    executable = None

    # Try explicit directories first (e.g. THEROCK_DIR/build/bin).
    for base in extra_command_search_paths:
        candidate = base / command
        # is_file() rather than exists(): a directory that happens to share
        # the command's name must not be picked up and executed.
        if candidate.is_file():
            executable = candidate
            break

    # Then fall back to PATH.
    if executable is None:
        executable = shutil.which(command)

    # Nothing found.
    if executable is None:
        log(f"{command}: command not found")
        return

    run_command([executable] + args)
def run_sanity(os_name: str) -> None:
    """Log driver / GPU information using the tools for the given platform.

    Tools are looked up first in the directory named by the
    ``THEROCK_BIN_DIR`` environment variable (default: ``build/bin`` under
    the parent of this script's directory) and then on ``PATH``.

    Args:
        os_name: Platform name; compared case-insensitively against
            ``"windows"``. Any other value takes the Linux path.
    """
    script_dir = Path(__file__).resolve().parent
    repo_root = script_dir.parent
    bin_dir = Path(os.getenv("THEROCK_BIN_DIR", repo_root / "build" / "bin"))

    def probe(label, command, args):
        # Every probe uses the same search order: bin_dir first, then PATH.
        run_command_with_search(
            label=label,
            command=command,
            args=args,
            extra_command_search_paths=[bin_dir],
        )

    log("=== Sanity check: driver / GPU info ===")

    if os_name.lower() != "windows":
        # Linux: amd-smi static + rocminfo
        # TODO(#2789): Remove conditional once amdsmi supports gfx1151
        amdsmi_usable = AMDGPU_FAMILIES not in unsupported_amdsmi_families

        if amdsmi_usable:
            probe("amd-smi static", "amd-smi", ["static"])
        probe("rocminfo", "rocminfo", [])
        probe("Kernel version", "uname", ["-r"])
        # Print per-component firmware versions (useful for debugging hangs)
        if amdsmi_usable:
            probe("amd-smi firmware", "amd-smi", ["firmware"])
    else:
        # Windows: only hipInfo.exe
        probe("hipInfo.exe", "hipInfo.exe", [])

    log("\n=== End of sanity check ===")
def main(argv: Optional[List[str]] = None) -> int:
    """Run the sanity check for the current platform.

    Args:
        argv: Accepted for a conventional entry-point signature; not used.

    Returns:
        Always 0.
    """
    run_sanity(platform.system())
    return 0
# Script entry point: exit with the status returned by main().
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)