ROCm · fsx950223 · Jun 2, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 4, 2026
diff --git a/python/flydsl/compiler/backends/rocm.py b/python/flydsl/compiler/backends/rocm.py
@@ -33,21 +33,22 @@ def _format_pass_opts(opts: dict) -> str:
         """Format {key: value, ...} as 'key=value key2=value2' for MLIR pass options."""
         return " ".join(f"{k}={v}" for k, v in opts.items())
 
-    def _pipeline_parts(self, *, compile_hints: dict) -> Tuple[List[str], str]:
-        chip = self.target.arch
+    def _bin_cli_opts(self, *, compile_hints: dict) -> List[str]:
         waves_per_eu = compile_hints.get("waves_per_eu")
         maxnreg = compile_hints.get("maxnreg")
-
         bin_cli_opts = []
         if env.debug.enable_debug_info:
             bin_cli_opts.append("-g")
         if waves_per_eu:
             bin_cli_opts.append(f"--amdgpu-waves-per-eu={waves_per_eu}")
         if maxnreg:
             bin_cli_opts.append(f"--amdgpu-num-vgpr={maxnreg}")
+        return bin_cli_opts
 
-        rocdl_opts = {
-            "O": 2,
+    def _rocdl_opts(self, *, compile_hints: dict, opt_level: int = 2) -> dict:
+        chip = self.target.arch
+        return {
+            "O": opt_level,
             "abi": 600,
             "chip": chip,
             "correct-sqrt": "true",
@@ -61,6 +62,24 @@ def _pipeline_parts(self, *, compile_hints: dict) -> Tuple[List[str], str]:
             "wave64": "false" if is_rdna_arch(chip) else "true",
         }
 
+    def llvm_recodegen_fragments(self, *, compile_hints: dict, opt_level: int = 0) -> Tuple[str, str]:
+        """Return the ``(attach, binary)`` pass fragments that re-codegen a target-less,
+        already-LLVM-dialect ``gpu.module``: ``rocdl-attach-target`` at ``opt_level``
+        followed by ``gpu-module-to-binary``.  The ``O=0`` default serves the custom
+        LLVM-pass path, which already ran its own ``opt`` pipeline (no re-optimizing)."""
+        rocdl = self._rocdl_opts(compile_hints=compile_hints, opt_level=opt_level)
+        cli_flags = " ".join(self._bin_cli_opts(compile_hints=compile_hints))
+        return (
+            "rocdl-attach-target{" + self._format_pass_opts(rocdl) + "}",
+            'gpu-module-to-binary{format=fatbin opts="' + cli_flags + '"}',
+        )
+
+    def _pipeline_parts(self, *, compile_hints: dict) -> Tuple[List[str], str]:
+        chip = self.target.arch
+
+        bin_cli_opts = self._bin_cli_opts(compile_hints=compile_hints)
+        rocdl_opts = self._rocdl_opts(compile_hints=compile_hints, opt_level=2)
+
         pre_binary_fragments = [
             "fly-rewrite-func-signature",
             "fly-canonicalize",

diff --git a/python/flydsl/compiler/external_llvm.py b/python/flydsl/compiler/external_llvm.py
@@ -204,3 +204,269 @@ def run_mlir_opt(*, pass_pipeline: str, input_path: Path, output_path: Path) ->
     finally:
         if tmp_dir_obj is not None:
             tmp_dir_obj.cleanup()
+
+
+def llvm_opt_fingerprint(pipeline: str, plugins: Optional[list] = None) -> str:
+    """Cache key for a custom LLVM-opt setup: the pipeline text plus each plugin's
+    path and content hash, so editing either one invalidates cached artifacts."""
+    entries = [f"llvm-opt:{pipeline}"]
+    for plugin in plugins or []:
+        plugin_path = Path(plugin).expanduser()
+        try:
+            digest = _file_hash(plugin_path.resolve())
+        except OSError:
+            digest = "<missing>"
+        entries.append(f"{plugin_path}:{digest}")
+    return ";".join(entries)
+
+
+def _run_tool(cmd: list, *, prefix: Path, what: str, work_dir: Path) -> None:
+    """Run *cmd*; re-raise timeout, non-zero exit and launch errors as ExternalLLVMError."""
+    where = f"command: {' '.join(cmd)}\nwork_dir: {work_dir}"
+    try:
+        subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=600, env=_subprocess_env(prefix))
+    except subprocess.TimeoutExpired as exc:
+        raise ExternalLLVMError(f"{what} timed out after 600s.\n{where}") from exc
+    except subprocess.CalledProcessError as exc:
+        streams = f"stdout:\n{exc.stdout}\nstderr:\n{exc.stderr}"
+        raise ExternalLLVMError(f"{what} failed.\nllvm_dir: {prefix}\n{where}\n{streams}") from exc
+    except OSError as exc:  # tool missing or not executable: subprocess raises before it runs
+        raise ExternalLLVMError(f"{what} could not be started: {exc}\nllvm_dir: {prefix}\n{where}") from exc
+
+
+def run_llvm_opt_then_binary(
+    module: ir.Module,  # in-process module whose top-level gpu.module gets replaced
+    *,
+    llvm_ir: str,  # LLVM IR text handed to ``opt``
+    attach_fragment: str,  # first pass of the external mlir-opt pipeline
+    binary_fragment: str,  # second pass of the external mlir-opt pipeline
+    pipeline: str,  # value for ``opt --passes=``
+    plugins: Optional[list] = None,  # each entry becomes ``--load-pass-plugin=<path>``
+    llvm_options: Optional[dict] = None,  # extra CLI options, given to both opt and mlir-opt
+    work_dir: Optional[Path] = None,  # None -> temporary directory, removed on exit
+    stage_prefix: str = "llvm_opt",  # filename prefix of the intermediate artifacts
+) -> None:
+    """Run a custom LLVM new-PM pass pipeline on the device kernel's (pre-link)
+    LLVM IR, then re-codegen the device binary and splice it back into *module*.
+
+    Flow: ``opt --passes`` (plus ``--load-pass-plugin`` per plugin) on ``llvm_ir``
+    -> ``mlir-translate --import-llvm`` -> wrap into a ``gpu.module`` -> external
+    ``mlir-opt`` running ``attach_fragment`` then ``binary_fragment`` -> replace
+    the in-process ``gpu.module`` with the produced ``gpu.binary``.  Raises
+    ``ExternalLLVMError`` if a tool fails or no binary file is produced.
+    """
+    prefix = _llvm_dir()
+    opt = _tool(prefix, "opt")
+    mlir_translate = _tool(prefix, "mlir-translate")
+    mlir_opt = _tool(prefix, "mlir-opt")
+
+    gpu_module = _single_top_level_op(module, "gpu.module")
+    name = _symbol_name(gpu_module)
+    data_layout = None
+    if "llvm.data_layout" in gpu_module.attributes:
+        try:
+            data_layout = ir.StringAttr(gpu_module.attributes["llvm.data_layout"]).value
+        except Exception:  # best-effort: an unreadable layout is simply not re-applied
+            data_layout = None
+
+    llvm_cli_args = _format_llvm_cli_options(llvm_options) if llvm_options else []
+
+    tmp_dir_obj = None
+    if work_dir is None:
+        tmp_dir_obj = tempfile.TemporaryDirectory(prefix="flydsl_llvm_opt_")
+        work_dir = Path(tmp_dir_obj.name)
+    else:
+        work_dir.mkdir(parents=True, exist_ok=True)  # caller-supplied directory is never cleaned up here
+
+    in_ll = work_dir / f"{stage_prefix}_pre_opt.ll"
+    out_ll = work_dir / f"{stage_prefix}_post_opt.ll"
+    imported_path = work_dir / f"{stage_prefix}_imported.mlir"
+    wrapped_path = work_dir / f"{stage_prefix}_wrapped.mlir"
+    bin_path = work_dir / f"{stage_prefix}_binary.mlir"
+
+    try:
+        in_ll.write_text(llvm_ir, encoding="utf-8")
+
+        plugin_args = [f"--load-pass-plugin={Path(p).expanduser()}" for p in (plugins or [])]
+        _run_tool(
+            [str(opt), str(in_ll), "-S", f"--passes={pipeline}", *plugin_args, *llvm_cli_args, "-o", str(out_ll)],
+            prefix=prefix,
+            what="LLVM opt pass pipeline",
+            work_dir=work_dir,
+        )
+
+        _run_tool(
+            [str(mlir_translate), "--import-llvm", str(out_ll), "-o", str(imported_path)],
+            prefix=prefix,
+            what="mlir-translate --import-llvm",
+            work_dir=work_dir,
+        )
+
+        # Wrap the re-imported LLVM-dialect IR back into a gpu.module with no
+        # target (attach_fragment adds one).  The original gpu.module's data layout,
+        # when it could be read, is re-applied; the result keeps the name @<name>.
+        imported = ir.Module.parse(imported_path.read_text(encoding="utf-8"), context=module.context)
+        body = "\n".join(op.operation.get_asm() for op in imported.body.operations)
+        dl_attr = f' attributes {{llvm.data_layout = "{data_layout}"}}' if data_layout else ""
+        wrapped_path.write_text(
+            f"module attributes {{gpu.container_module}} {{\n" f"  gpu.module @{name}{dl_attr} {{\n{body}\n  }}\n}}\n",
+            encoding="utf-8",
+        )
+
+        _run_tool(
+            [
+                str(mlir_opt),
+                str(wrapped_path),
+                f"--pass-pipeline=builtin.module({attach_fragment},{binary_fragment})",
+                *llvm_cli_args,
+                "-o",
+                str(bin_path),
+            ],
+            prefix=prefix,
+            what="external gpu-module-to-binary codegen",
+            work_dir=work_dir,
+        )
+
+        if not bin_path.is_file():
+            raise ExternalLLVMError(f"external codegen did not create output file: {bin_path}")
+        binary_module = ir.Module.parse(bin_path.read_text(encoding="utf-8"), context=module.context)
+        _replace_gpu_module_with_binary_op(module, binary_module)
+    finally:
+        if tmp_dir_obj is not None:  # only the temporary directory created above is removed
+            tmp_dir_obj.cleanup()
+
+
+# ---------------------------------------------------------------------------
+# Custom-codegen path: fly-llc (IR -> obj with injectable MIR passes) + ld.lld
+# ---------------------------------------------------------------------------
+
+
+def _fly_llc_path() -> Path:
+    """Resolve the fly-llc executable: a non-empty ``env.compile.fly_llc`` wins
+    (returned without an existence check), else ``<llvm_dir>/bin/fly-llc``."""
+    override = env.compile.fly_llc.strip()
+    tool = Path(override).expanduser() if override else _llvm_dir() / "bin" / "fly-llc"
+    if override or tool.is_file():
+        return tool
+    raise ExternalLLVMError(
+        "fly-llc tool not found: set FLYDSL_COMPILE_FLY_LLC or build fly-llc into <FLYDSL_COMPILE_LLVM_DIR>/bin."
+    )
+
+
+def _lld_path() -> Path:
+    """Resolve the linker for the fly-llc codegen path: a non-empty ``env.compile.lld``
+    wins (returned without an existence check), else ``<llvm_dir>/bin/ld.lld``."""
+    override = env.compile.lld.strip()
+    tool = Path(override).expanduser() if override else _llvm_dir() / "bin" / "ld.lld"
+    if override or tool.is_file():
+        return tool
+    raise ExternalLLVMError(
+        "fly-llc codegen path needs ld.lld: set FLYDSL_COMPILE_LLD or place ld.lld in <FLYDSL_COMPILE_LLVM_DIR>/bin."
+    )
+
+
+def fly_llc_codegen_fingerprint(passes: Optional[list] = None, plugins: Optional[list] = None) -> str:
+    """Cache key for a fly-llc codegen setup: the comma-joined pass names, the
+    fly-llc binary's content hash, and each plugin's path plus content hash."""
+    entries = ["fly-llc-codegen:" + ",".join(passes or [])]
+    try:
+        entries.append(_file_hash(_fly_llc_path().resolve()))
+    except (OSError, ExternalLLVMError):
+        # fly-llc could not be located or read: record a fixed placeholder instead.
+        entries.append("<no-fly-llc>")
+    for plugin in plugins or []:
+        plugin_path = Path(plugin).expanduser()
+        try:
+            digest = _file_hash(plugin_path.resolve())
+        except OSError:
+            digest = "<missing>"
+        entries.append(f"{plugin_path}:{digest}")
+    return ";".join(entries)
+
+
+def _gpu_binary_module_text(name: str, target_cpu: str, hsaco: bytes) -> str:
+    """Render ``builtin.module`` text holding a single ``gpu.binary @name`` whose
+    object embeds *hsaco*, each byte written as a ``\\XX`` escape in the string."""
+    payload = "".join(f"\\{byte:02X}" for byte in hsaco)
+    gpu_object = f'#gpu.object<#rocdl.target<chip = "{target_cpu}">, kernels = <>, bin = "{payload}">'
+    return (
+        "module attributes {gpu.container_module} {\n"
+        f"  gpu.binary @{name} [{gpu_object}]\n}}\n"
+    )
+
+
+def run_fly_llc_codegen(
+    module: ir.Module,  # in-process module whose top-level gpu.module gets replaced
+    *,
+    llvm_ir: str,  # LLVM IR text handed to fly-llc
+    codegen_passes: list,  # each name becomes ``--pre-emit-pass=<name>``
+    codegen_plugins: Optional[list] = None,  # each entry becomes ``--load=<path>``
+    target_triple: str,  # forwarded as ``-mtriple=``
+    target_cpu: str,  # forwarded as ``-mcpu=`` and used as the rocdl target chip
+    work_dir: Optional[Path] = None,  # None -> temporary directory, removed on exit
+    stage_prefix: str = "fly_llc",  # filename prefix of the intermediate artifacts
+) -> None:
+    """Codegen the device kernel's LLVM IR with injectable MIR passes and splice
+    the result back into *module*.
+
+    Flow: ``fly-llc <in.ll> -o <obj> --load=<plugin> --pre-emit-pass=<pass>``
+    (custom MIR passes run pre-emit in the standard codegen) -> ``ld.lld -shared``
+    -> wrap the HSACO bytes into a ``gpu.binary`` -> replace the in-process
+    ``gpu.module``.  Raises ``ExternalLLVMError`` if a tool fails or no HSACO appears.
+    """
+    fly_llc = _fly_llc_path()
+    lld = _lld_path()
+    prefix = _llvm_dir()  # passed to _run_tool for the subprocess environment and error text
+
+    gpu_module = _single_top_level_op(module, "gpu.module")
+    name = _symbol_name(gpu_module)
+
+    tmp_dir_obj = None
+    if work_dir is None:
+        tmp_dir_obj = tempfile.TemporaryDirectory(prefix="flydsl_fly_llc_")
+        work_dir = Path(tmp_dir_obj.name)
+    else:
+        work_dir.mkdir(parents=True, exist_ok=True)  # caller-supplied directory is never cleaned up here
+
+    in_ll = work_dir / f"{stage_prefix}_pre_codegen.ll"
+    obj = work_dir / f"{stage_prefix}.o"
+    hsaco = work_dir / f"{stage_prefix}.hsaco"
+    bin_mlir = work_dir / f"{stage_prefix}_binary.mlir"
+
+    try:
+        in_ll.write_text(llvm_ir, encoding="utf-8")
+
+        plugin_args = [f"--load={Path(p).expanduser()}" for p in (codegen_plugins or [])]
+        pass_args = [f"--pre-emit-pass={n}" for n in (codegen_passes or [])]
+        _run_tool(
+            [
+                str(fly_llc),
+                str(in_ll),
+                "-o",
+                str(obj),
+                f"-mtriple={target_triple}",
+                f"-mcpu={target_cpu}",
+                *plugin_args,
+                *pass_args,
+            ],
+            prefix=prefix,
+            what="fly-llc codegen",
+            work_dir=work_dir,
+        )
+
+        _run_tool(
+            [str(lld), "-shared", str(obj), "-o", str(hsaco)],
+            prefix=prefix,
+            what="ld.lld HSACO link",
+            work_dir=work_dir,
+        )
+
+        if not hsaco.is_file():
+            raise ExternalLLVMError(f"ld.lld did not create HSACO: {hsaco}")
+        text = _gpu_binary_module_text(name, target_cpu, hsaco.read_bytes())
+        bin_mlir.write_text(text, encoding="utf-8")  # on-disk copy; parsing below uses the in-memory text
+        binary_module = ir.Module.parse(text, context=module.context)
+        _replace_gpu_module_with_binary_op(module, binary_module)
+    finally:
+        if tmp_dir_obj is not None:  # only the temporary directory created above is removed
+            tmp_dir_obj.cleanup()