Skip to content
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions python/flydsl/compiler/backends/rocm.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,22 @@ def _format_pass_opts(opts: dict) -> str:
"""Format {key: value, ...} as 'key=value key2=value2' for MLIR pass options."""
return " ".join(f"{k}={v}" for k, v in opts.items())

def _pipeline_parts(self, *, compile_hints: dict) -> Tuple[List[str], str]:
chip = self.target.arch
def _bin_cli_opts(self, *, compile_hints: dict) -> List[str]:
    """Assemble the CLI option list handed to the device-binary step.

    ``-g`` is included when ``env.debug.enable_debug_info`` is set. The
    ``waves_per_eu`` and ``maxnreg`` entries of *compile_hints* become
    ``--amdgpu-waves-per-eu=<value>`` and ``--amdgpu-num-vgpr=<value>``;
    a hint that is absent or falsy (``None``, ``0``) contributes nothing.
    """
    waves = compile_hints.get("waves_per_eu")
    vgpr_count = compile_hints.get("maxnreg")

    flags = ["-g"] if env.debug.enable_debug_info else []
    if waves:
        flags.append(f"--amdgpu-waves-per-eu={waves}")
    if vgpr_count:
        flags.append(f"--amdgpu-num-vgpr={vgpr_count}")
    return flags

rocdl_opts = {
"O": 2,
def _rocdl_opts(self, *, compile_hints: dict, opt_level: int = 2) -> dict:
chip = self.target.arch
return {
"O": opt_level,
"abi": 600,
"chip": chip,
"correct-sqrt": "true",
Expand All @@ -61,6 +62,24 @@ def _pipeline_parts(self, *, compile_hints: dict) -> Tuple[List[str], str]:
"wave64": "false" if is_rdna_arch(chip) else "true",
}

def llvm_recodegen_fragments(self, *, compile_hints: dict, opt_level: int = 0) -> Tuple[str, str]:
    """Return the ``(attach, binary)`` pass-pipeline fragments for re-codegen.

    Intended for a ``gpu.module`` that is already in the LLVM dialect and has
    no target attached yet. The first fragment is a ``rocdl-attach-target``
    pass configured from ``_rocdl_opts`` at *opt_level*; the second is a
    ``gpu-module-to-binary`` pass (``format=fatbin``) carrying the options from
    ``_bin_cli_opts``. The custom-LLVM-pass path uses this after running its
    own ``opt`` pipeline, hence the ``opt_level=0`` default: codegen should not
    optimize the IR a second time.
    """
    target_opts = self._rocdl_opts(compile_hints=compile_hints, opt_level=opt_level)
    codegen_flags = self._bin_cli_opts(compile_hints=compile_hints)

    formatted_target = self._format_pass_opts(target_opts)
    joined_flags = " ".join(codegen_flags)
    return (
        f"rocdl-attach-target{{{formatted_target}}}",
        f'gpu-module-to-binary{{format=fatbin opts="{joined_flags}"}}',
    )

def _pipeline_parts(self, *, compile_hints: dict) -> Tuple[List[str], str]:
chip = self.target.arch

bin_cli_opts = self._bin_cli_opts(compile_hints=compile_hints)
rocdl_opts = self._rocdl_opts(compile_hints=compile_hints, opt_level=2)

pre_binary_fragments = [
"fly-rewrite-func-signature",
"fly-canonicalize",
Expand Down
266 changes: 266 additions & 0 deletions python/flydsl/compiler/external_llvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,269 @@ def run_mlir_opt(*, pass_pipeline: str, input_path: Path, output_path: Path) ->
finally:
if tmp_dir_obj is not None:
tmp_dir_obj.cleanup()


def llvm_opt_fingerprint(pipeline: str, plugins: Optional[list] = None) -> str:
    """Build the cache fingerprint of a custom LLVM-opt configuration.

    The result is ``llvm-opt:<pipeline>`` followed by one ``<path>:<hash>``
    entry per plugin, all joined with ``;``. The hash is taken over the
    resolved plugin file, so a changed pipeline string or a rebuilt plugin
    yields a different fingerprint. A plugin whose hashing raises ``OSError``
    is recorded as ``<path>:<missing>`` instead of failing.
    """

    def _describe(plugin) -> str:
        # The user-expanded (not resolved) path is what appears in the entry.
        expanded = Path(plugin).expanduser()
        try:
            digest = _file_hash(expanded.resolve())
        except OSError:
            return f"{expanded}:<missing>"
        return f"{expanded}:{digest}"

    fragments = [f"llvm-opt:{pipeline}"]
    fragments.extend(_describe(plugin) for plugin in plugins or [])
    return ";".join(fragments)


def _run_tool(cmd: list, *, prefix: Path, what: str, work_dir: Path, timeout: float = 600) -> None:
    """Run an external tool and report every failure as ``ExternalLLVMError``.

    *cmd* is the argv list (executed without a shell) and its output is
    captured as text. *prefix* is used to build the subprocess environment via
    ``_subprocess_env`` and is echoed in error messages. *what* is the
    human-readable step name that prefixes each message. *work_dir* is only
    reported in messages; it is not used as the working directory. *timeout*
    is the number of seconds to wait before giving up (default 600).

    Raises ``ExternalLLVMError`` when the tool cannot be launched, exceeds
    *timeout*, or exits with a non-zero status; the original exception is
    chained as the cause.
    """
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=timeout, env=_subprocess_env(prefix))
    except subprocess.TimeoutExpired as exc:
        raise ExternalLLVMError(
            f"{what} timed out after {timeout}s.\ncommand: {' '.join(cmd)}\nwork_dir: {work_dir}"
        ) from exc
    except subprocess.CalledProcessError as exc:
        raise ExternalLLVMError(
            f"{what} failed.\nllvm_dir: {prefix}\ncommand: {' '.join(cmd)}\n"
            f"work_dir: {work_dir}\nstdout:\n{exc.stdout}\nstderr:\n{exc.stderr}"
        ) from exc
    except OSError as exc:
        # A missing or non-executable binary surfaces here (FileNotFoundError,
        # PermissionError); convert it so callers see one error type per step.
        raise ExternalLLVMError(
            f"{what} could not be started: {exc}\nllvm_dir: {prefix}\n"
            f"command: {' '.join(cmd)}\nwork_dir: {work_dir}"
        ) from exc


def run_llvm_opt_then_binary(
    module: ir.Module,
    *,
    llvm_ir: str,
    attach_fragment: str,
    binary_fragment: str,
    pipeline: str,
    plugins: Optional[list] = None,
    llvm_options: Optional[dict] = None,
    work_dir: Optional[Path] = None,
    stage_prefix: str = "llvm_opt",
) -> None:
    """Apply a custom LLVM pass pipeline to *llvm_ir*, re-codegen, and splice
    the resulting binary back into *module*.

    Steps, each run as an external tool from the configured LLVM directory:

    1. ``opt -S --passes=<pipeline>`` on *llvm_ir*, with one
       ``--load-pass-plugin`` per entry of *plugins* and the CLI form of
       *llvm_options*.
    2. ``mlir-translate --import-llvm`` on the optimized IR.
    3. The imported ops are wrapped in a ``gpu.module`` named after the
       single top-level ``gpu.module`` of *module*, carrying its
       ``llvm.data_layout`` when that could be read.
    4. ``mlir-opt`` runs ``builtin.module(<attach_fragment>,<binary_fragment>)``
       on the wrapped text (again with the *llvm_options* CLI flags).
    5. The output is parsed in *module*'s context and passed to
       ``_replace_gpu_module_with_binary_op``.

    Intermediate files are named ``<stage_prefix>_*`` inside *work_dir*. When
    *work_dir* is ``None`` a temporary directory is used and removed on exit;
    otherwise the directory is created if needed and left in place.
    """
    llvm_root = _llvm_dir()
    opt_exe = _tool(llvm_root, "opt")
    translate_exe = _tool(llvm_root, "mlir-translate")
    mlir_opt_exe = _tool(llvm_root, "mlir-opt")

    device_module = _single_top_level_op(module, "gpu.module")
    kernel_symbol = _symbol_name(device_module)

    # Best effort: an unreadable data-layout attribute is treated as absent.
    layout = None
    if "llvm.data_layout" in device_module.attributes:
        try:
            layout = ir.StringAttr(device_module.attributes["llvm.data_layout"]).value
        except Exception:
            layout = None

    extra_cli = _format_llvm_cli_options(llvm_options) if llvm_options else []

    scratch = None
    if work_dir is None:
        scratch = tempfile.TemporaryDirectory(prefix="flydsl_llvm_opt_")
        work_dir = Path(scratch.name)
    else:
        work_dir.mkdir(parents=True, exist_ok=True)

    pre_opt_ll = work_dir / f"{stage_prefix}_pre_opt.ll"
    post_opt_ll = work_dir / f"{stage_prefix}_post_opt.ll"
    imported_mlir = work_dir / f"{stage_prefix}_imported.mlir"
    wrapped_mlir = work_dir / f"{stage_prefix}_wrapped.mlir"
    binary_mlir = work_dir / f"{stage_prefix}_binary.mlir"

    try:
        pre_opt_ll.write_text(llvm_ir, encoding="utf-8")

        # Step 1: run the user's pass pipeline with the external opt.
        opt_cmd = [str(opt_exe), str(pre_opt_ll), "-S", f"--passes={pipeline}"]
        opt_cmd.extend(f"--load-pass-plugin={Path(plugin).expanduser()}" for plugin in plugins or [])
        opt_cmd.extend(extra_cli)
        opt_cmd.extend(["-o", str(post_opt_ll)])
        _run_tool(opt_cmd, prefix=llvm_root, what="LLVM opt pass pipeline", work_dir=work_dir)

        # Step 2: bring the optimized LLVM IR back into the LLVM dialect.
        translate_cmd = [str(translate_exe), "--import-llvm", str(post_opt_ll), "-o", str(imported_mlir)]
        _run_tool(translate_cmd, prefix=llvm_root, what="mlir-translate --import-llvm", work_dir=work_dir)

        # Step 3: re-wrap the imported ops in a target-less gpu.module; the
        # attach fragment supplies the target and gpu-module-to-binary then
        # yields gpu.binary @<kernel_symbol>.
        imported_module = ir.Module.parse(imported_mlir.read_text(encoding="utf-8"), context=module.context)
        body_asm = "\n".join(op.operation.get_asm() for op in imported_module.body.operations)
        layout_clause = f' attributes {{llvm.data_layout = "{layout}"}}' if layout else ""
        wrapped_text = (
            "module attributes {gpu.container_module} {\n"
            f" gpu.module @{kernel_symbol}{layout_clause} {{\n"
            f"{body_asm}\n"
            " }\n"
            "}\n"
        )
        wrapped_mlir.write_text(wrapped_text, encoding="utf-8")

        # Step 4: attach the target and emit the device binary.
        codegen_cmd = [
            str(mlir_opt_exe),
            str(wrapped_mlir),
            f"--pass-pipeline=builtin.module({attach_fragment},{binary_fragment})",
        ]
        codegen_cmd.extend(extra_cli)
        codegen_cmd.extend(["-o", str(binary_mlir)])
        _run_tool(codegen_cmd, prefix=llvm_root, what="external gpu-module-to-binary codegen", work_dir=work_dir)

        # Step 5: swap the produced binary into the in-process module.
        if not binary_mlir.is_file():
            raise ExternalLLVMError(f"external codegen did not create output file: {binary_mlir}")
        produced = ir.Module.parse(binary_mlir.read_text(encoding="utf-8"), context=module.context)
        _replace_gpu_module_with_binary_op(module, produced)
    finally:
        if scratch is not None:
            scratch.cleanup()


# ---------------------------------------------------------------------------
# Custom-codegen path: fly-llc (IR -> obj with injectable MIR passes) + ld.lld
# ---------------------------------------------------------------------------


def _fly_llc_path() -> Path:
    """Return the path of the ``fly-llc`` executable.

    A non-blank ``env.compile.fly_llc`` setting takes precedence and is
    returned with ``~`` expanded, without checking that it exists. Otherwise
    ``<llvm_dir>/bin/fly-llc`` is used, and ``ExternalLLVMError`` is raised if
    that file is not present.
    """
    override = env.compile.fly_llc.strip()
    if not override:
        bundled = _llvm_dir() / "bin" / "fly-llc"
        if not bundled.is_file():
            raise ExternalLLVMError(
                "fly-llc tool not found: set FLYDSL_COMPILE_FLY_LLC or build fly-llc into <FLYDSL_COMPILE_LLVM_DIR>/bin."
            )
        return bundled
    return Path(override).expanduser()


def _lld_path() -> Path:
    """Return the path of the ``ld.lld`` linker.

    A non-blank ``env.compile.lld`` setting takes precedence and is returned
    with ``~`` expanded, without checking that it exists. Otherwise
    ``<llvm_dir>/bin/ld.lld`` is used, and ``ExternalLLVMError`` is raised if
    that file is not present.
    """
    override = env.compile.lld.strip()
    if not override:
        bundled = _llvm_dir() / "bin" / "ld.lld"
        if not bundled.is_file():
            raise ExternalLLVMError(
                "fly-llc codegen path needs ld.lld: set FLYDSL_COMPILE_LLD or place ld.lld in <FLYDSL_COMPILE_LLVM_DIR>/bin."
            )
        return bundled
    return Path(override).expanduser()


def fly_llc_codegen_fingerprint(passes: Optional[list] = None, plugins: Optional[list] = None) -> str:
    """Build the cache fingerprint of a fly-llc codegen configuration.

    The ``;``-joined result consists of ``fly-llc-codegen:<comma-joined
    passes>``, the content hash of the resolved fly-llc binary, and one
    ``<path>:<hash>`` entry per plugin. If fly-llc cannot be located or read
    its slot is ``<no-fly-llc>``; a plugin whose hashing raises ``OSError`` is
    recorded as ``<path>:<missing>``.
    """
    fragments = ["fly-llc-codegen:" + ",".join(passes or [])]

    try:
        tool_digest = _file_hash(_fly_llc_path().resolve())
    except (OSError, ExternalLLVMError):
        tool_digest = "<no-fly-llc>"
    fragments.append(tool_digest)

    for plugin in plugins or []:
        expanded = Path(plugin).expanduser()
        try:
            entry = f"{expanded}:{_file_hash(expanded.resolve())}"
        except OSError:
            entry = f"{expanded}:<missing>"
        fragments.append(entry)

    return ";".join(fragments)


def _gpu_binary_module_text(name: str, target_cpu: str, hsaco: bytes) -> str:
"""Build a ``builtin.module`` text embedding *hsaco* as a ``gpu.binary @name``
(every byte escaped as ``\\XX`` for the MLIR string attribute)."""
esc = "".join("\\%02X" % b for b in hsaco)
return (
"module attributes {gpu.container_module} {\n"
f' gpu.binary @{name} [#gpu.object<#rocdl.target<chip = "{target_cpu}">, kernels = <>, bin = "{esc}">]\n'
"}\n"
)


def run_fly_llc_codegen(
    module: ir.Module,
    *,
    llvm_ir: str,
    codegen_passes: list,
    codegen_plugins: Optional[list] = None,
    target_triple: str,
    target_cpu: str,
    work_dir: Optional[Path] = None,
    stage_prefix: str = "fly_llc",
) -> None:
    """Compile *llvm_ir* with fly-llc and splice the linked binary into *module*.

    Steps:

    1. ``fly-llc <in.ll> -o <obj> -mtriple=<target_triple> -mcpu=<target_cpu>``
       with one ``--load=<plugin>`` per entry of *codegen_plugins* and one
       ``--pre-emit-pass=<name>`` per entry of *codegen_passes*.
    2. ``ld.lld -shared <obj> -o <hsaco>``.
    3. The HSACO bytes are embedded in ``gpu.binary`` text (named after the
       single top-level ``gpu.module`` of *module*), which is written to disk,
       parsed in *module*'s context and passed to
       ``_replace_gpu_module_with_binary_op``.

    Intermediate files are named ``<stage_prefix>*`` inside *work_dir*. When
    *work_dir* is ``None`` a temporary directory is used and removed on exit;
    otherwise the directory is created if needed and left in place.
    """
    llc_exe = _fly_llc_path()
    linker_exe = _lld_path()
    llvm_root = _llvm_dir()

    kernel_symbol = _symbol_name(_single_top_level_op(module, "gpu.module"))

    scratch = None
    if work_dir is None:
        scratch = tempfile.TemporaryDirectory(prefix="flydsl_fly_llc_")
        work_dir = Path(scratch.name)
    else:
        work_dir.mkdir(parents=True, exist_ok=True)

    ll_input = work_dir / f"{stage_prefix}_pre_codegen.ll"
    object_file = work_dir / f"{stage_prefix}.o"
    hsaco_file = work_dir / f"{stage_prefix}.hsaco"
    binary_mlir = work_dir / f"{stage_prefix}_binary.mlir"

    try:
        ll_input.write_text(llvm_ir, encoding="utf-8")

        # Step 1: IR -> object, with plugins loaded before the pass flags.
        llc_cmd = [
            str(llc_exe),
            str(ll_input),
            "-o",
            str(object_file),
            f"-mtriple={target_triple}",
            f"-mcpu={target_cpu}",
        ]
        llc_cmd.extend(f"--load={Path(plugin).expanduser()}" for plugin in codegen_plugins or [])
        llc_cmd.extend(f"--pre-emit-pass={pass_name}" for pass_name in codegen_passes or [])
        _run_tool(llc_cmd, prefix=llvm_root, what="fly-llc codegen", work_dir=work_dir)

        # Step 2: object -> HSACO shared object.
        link_cmd = [str(linker_exe), "-shared", str(object_file), "-o", str(hsaco_file)]
        _run_tool(link_cmd, prefix=llvm_root, what="ld.lld HSACO link", work_dir=work_dir)

        # Step 3: embed the bytes and swap the binary into the module.
        if not hsaco_file.is_file():
            raise ExternalLLVMError(f"ld.lld did not create HSACO: {hsaco_file}")
        binary_text = _gpu_binary_module_text(kernel_symbol, target_cpu, hsaco_file.read_bytes())
        binary_mlir.write_text(binary_text, encoding="utf-8")
        produced = ir.Module.parse(binary_text, context=module.context)
        _replace_gpu_module_with_binary_op(module, produced)
    finally:
        if scratch is not None:
            scratch.cleanup()
Loading
Loading