Skip to content
Draft

[WIP] #3025

Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
16df05e
Unified compilation workflow: CompilableDesign, CallableDesign, @iron…
hunhoffe Apr 6, 2026
6d4af17
Re-apply migration to test_compile_cache_functionality.py
hunhoffe Apr 6, 2026
dd1cbb5
Finish unify-compilation-workflow plan: CMakeLists, __init__, test cl…
hunhoffe Apr 9, 2026
50821eb
Add comprehensive kernel factory wrappers with unit tests
hunhoffe Apr 9, 2026
7522a18
Add trace_config support to CallableDesign and @iron.jit
hunhoffe Apr 9, 2026
619a13b
migrate passthrough_kernel to @iron.jit
hunhoffe Apr 9, 2026
15c1515
migrate vector_reduce_add to @iron.jit
hunhoffe Apr 9, 2026
f2221c6
migrate vector_scalar_mul to @iron.jit
hunhoffe Apr 9, 2026
268d110
migrate eltwise_add and eltwise_mul to @iron.jit
hunhoffe Apr 9, 2026
7092069
Audit fixes: API quality, naming, validation, brevity
hunhoffe Apr 9, 2026
a04222f
Eliminate duplicate global-scanning logic between compilabledesign an…
hunhoffe Apr 9, 2026
8b8d2dc
Fix lower() to warn when call-time kwargs are overridden by pre-bound…
hunhoffe Apr 9, 2026
c57eff5
Fix lower(): call-time Compile[T] kwargs override pre-bound values
hunhoffe Apr 9, 2026
9247b0a
Fix ExternalFunction hash collision causing flaky NPU tests
hunhoffe Apr 9, 2026
4b3cccd
Fix ExternalFunction cache bugs causing flaky NPU tests
hunhoffe Apr 9, 2026
46e60ff
Replace xfail with skip_on_f32_failure fixture for Peano f32 bug
hunhoffe Apr 9, 2026
0de06cd
Revert programming_examples changes to main state
hunhoffe Apr 13, 2026
2d711fe
Merge branch 'main' into unify-compilation-workflow
hunhoffe Apr 13, 2026
05dc9f3
Move compile/jit/kernels code from iron to utils; split kernels into …
hunhoffe Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/modulesXilinx
Submodule modulesXilinx updated 1 file
+7 −21 FindXRT.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# passthrough_kernel/passthrough_kernel_iron_jit.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates

import numpy as np
import sys

import aie.iron as iron
from aie.iron import Compile, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer


@iron.jit
def passthrough_kernel(
    input0: iron.In,
    output: iron.Out,
    *,
    N: Compile[int],
    trace_size: Compile[int] = 0,
):
    """Build an AIE design that copies ``input0`` to ``output`` unchanged.

    Parameters
    ----------
    input0 : iron.In
        Input tensor of ``N`` uint8 elements.
    output : iron.Out
        Output tensor of ``N`` uint8 elements.
    N : Compile[int]
        Number of elements to move (compile-time parameter).
    trace_size : Compile[int], optional
        When non-zero, enables tracing with a buffer of this size.

    Returns
    -------
    The placed MLIR module produced by ``resolve_program``.
    """
    in_dtype = np.uint8
    line_size = N
    line_type = np.ndarray[(line_size,), np.dtype[in_dtype]]

    # Dataflow with ObjectFifos
    of_in = ObjectFifo(line_type, name="in")
    of_out = ObjectFifo(line_type, name="out")

    # External kernel from installed aie_kernels
    passthrough_fn = iron.kernels.passthrough(tile_size=line_size, dtype=in_dtype)

    # Task for the core to perform: acquire one element on each fifo,
    # run the passthrough kernel, then release both.
    def core_fn(of_in, of_out, passThroughLine):
        elemOut = of_out.acquire(1)
        elemIn = of_in.acquire(1)
        passThroughLine(elemIn, elemOut, line_size)
        of_in.release(1)
        of_out.release(1)

    # Create a worker to perform the task.
    # NOTE: use the fn_args= keyword for consistency with the other
    # @iron.jit examples (my_reduce_add, vector_scalar_mul).
    my_worker = Worker(
        core_fn,
        fn_args=[of_in.cons(), of_out.prod(), passthrough_fn],
    )

    # Runtime operations to move data to/from the AIE-array
    rt = Runtime()
    with rt.sequence(line_type, line_type) as (a_in, b_out):
        if trace_size:
            rt.enable_trace(trace_size, workers=[my_worker])
        rt.start(my_worker)
        rt.fill(of_in.prod(), a_in)
        rt.drain(of_out.cons(), b_out, wait=True)

    # Place components and generate an MLIR module
    return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer())


def main():
    """Run the pass-through kernel on the NPU and verify the copy."""
    num_elems = 4096

    input0 = iron.arange(num_elems, dtype=np.uint8, device="npu")
    output = iron.zeros(num_elems, dtype=np.uint8, device="npu")

    passthrough_kernel(input0, output, N=num_elems)

    # Bring both tensors back to host memory before comparing.
    input0.to("cpu")
    output.to("cpu")
    matches = np.equal(input0.numpy(), output.numpy())
    errors = np.size(matches) - np.count_nonzero(matches)

    if errors:
        print("\nError count: ", errors)
        print("\nfailed.\n")
        sys.exit(1)
    print("\nPASS!\n")
    sys.exit(0)


# Script entry point: build, run, and verify the pass-through design.
if __name__ == "__main__":
    main()
7 changes: 7 additions & 0 deletions programming_examples/basic/passthrough_kernel/run_jit.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// (c) Copyright 2026 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, peano
//
// RUN: %run_on_npu1% python3 %S/passthrough_kernel_iron_jit.py
// RUN: %run_on_npu2% python3 %S/passthrough_kernel_iron_jit.py
7 changes: 7 additions & 0 deletions programming_examples/basic/vector_reduce_add/run_jit.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// (c) Copyright 2026 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, peano
//
// RUN: %run_on_npu1% python3 %S/vector_reduce_add_iron_jit.py
// RUN: %run_on_npu2% python3 %S/vector_reduce_add_iron_jit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# vector_reduce_add/vector_reduce_add_iron_jit.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates

import numpy as np
import sys

import aie.iron as iron
from aie.iron import Compile, In, Out, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer


@iron.jit
def my_reduce_add(
    input_tensor: In,
    output_tensor: Out,
    *,
    N: Compile[int] = 1024,
):
    """Sum the ``N`` int32 elements of ``input_tensor`` into ``output_tensor[0]``.

    ``N`` is a compile-time parameter; the design is specialized per value.
    Returns the placed MLIR module from ``resolve_program``.
    """
    vec_ty = np.ndarray[(N,), np.dtype[np.int32]]
    result_ty = np.ndarray[(1,), np.dtype[np.int32]]

    # Dataflow: one fifo carries the input vector in, one carries the
    # single-element result out.
    fifo_in = ObjectFifo(vec_ty, name="in")
    fifo_out = ObjectFifo(result_ty, name="out")

    # Pre-built reduction kernel from the installed aie_kernels.
    reduce_kernel = iron.kernels.reduce_add(tile_size=N)

    # Core task: acquire one element per fifo, reduce, release both.
    def task(fifo_in, fifo_out, reduce_fn):
        result_elem = fifo_out.acquire(1)
        vec_elem = fifo_in.acquire(1)
        reduce_fn(vec_elem, result_elem, N)
        fifo_in.release(1)
        fifo_out.release(1)

    compute = Worker(task, fn_args=[fifo_in.cons(), fifo_out.prod(), reduce_kernel])

    # Host-side sequence: start the worker, push the input, pull the result.
    rt = Runtime()
    with rt.sequence(vec_ty, result_ty) as (vec_in, scalar_out):
        rt.start(compute)
        rt.fill(fifo_in.prod(), vec_in)
        rt.drain(fifo_out.cons(), scalar_out, wait=True)

    return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer())


def main():
    """Run the reduction on the NPU and check the result against NumPy."""
    N = 1024
    input_tensor = iron.randint(0, 100, (N,), dtype=np.int32, device="npu")
    output_tensor = iron.zeros((1,), dtype=np.int32, device="npu")

    my_reduce_add(input_tensor, output_tensor, N=N)

    # Move tensors back to host memory before reading them with .numpy(),
    # matching the sibling examples (passthrough_kernel, vector_scalar_mul),
    # which call .to("cpu") before verification.
    input_tensor.to("cpu")
    output_tensor.to("cpu")

    expected = int(np.sum(input_tensor.numpy()))
    computed = int(output_tensor.numpy()[0])

    if expected == computed:
        print("\nPASS!\n")
        sys.exit(0)
    else:
        print(f"\nFAIL! Expected {expected} but got {computed}")
        sys.exit(1)


# Script entry point: build, run, and verify the reduce-add design.
if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions programming_examples/basic/vector_scalar_mul/run_jit.lit
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
//
// RUN: %run_on_npu1% python3 %S/vector_scalar_mul_jit.py --iters 10 --warmup 10 -n 4096
// RUN: %run_on_npu2% python3 %S/vector_scalar_mul_jit.py --iters 10 --warmup 10 -n 4096
// RUN: %run_on_npu1% python3 %S/vector_scalar_mul_iron_jit.py
// RUN: %run_on_npu2% python3 %S/vector_scalar_mul_iron_jit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# vector_scalar_mul/vector_scalar_mul_iron_jit.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates

import numpy as np
import sys

import aie.iron as iron
from aie.iron import Compile, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.iron.controlflow import range_


@iron.jit
def vector_scalar_mul(
    input0: iron.In,
    factor: iron.In,
    output: iron.Out,
    *,
    N: Compile[int],
    dtype: Compile[type] = np.int16,
    trace_size: Compile[int] = 0,
):
    """Multiply each of the ``N`` elements of ``input0`` by the runtime
    scalar ``factor``, writing the result to ``output``.

    ``N``, ``dtype``, and ``trace_size`` are compile-time parameters.
    The vector is processed as four sub-vectors on a single compute tile.
    Returns the placed MLIR module from ``resolve_program``.
    """
    n_chunks = 4
    chunk_size = N // n_chunks

    # Tensor types: full vector at the host boundary, chunk-sized tiles
    # on the fifos, and a single int32 for the scale factor.
    vec_ty = np.ndarray[(N,), np.dtype[dtype]]
    chunk_ty = np.ndarray[(chunk_size,), np.dtype[dtype]]
    factor_ty = np.ndarray[(1,), np.dtype[np.int32]]

    # Pre-built scale kernel from the installed aie_kernels.
    scale_kernel_fn = iron.kernels.scale(tile_size=chunk_size, dtype=dtype)

    # Fifos for input chunks, the scalar factor, and output chunks.
    fifo_in = ObjectFifo(chunk_ty, name="in")
    fifo_factor = ObjectFifo(factor_ty, name="infactor")
    fifo_out = ObjectFifo(chunk_ty, name="out")

    # Core task: hold the factor for the whole run while streaming the
    # chunks through the scale kernel one at a time.
    def task(fifo_in, fifo_factor, fifo_out, scale_fn):
        factor_elem = fifo_factor.acquire(1)
        for _ in range_(n_chunks):
            src = fifo_in.acquire(1)
            dst = fifo_out.acquire(1)
            scale_fn(src, dst, factor_elem, chunk_size)
            fifo_in.release(1)
            fifo_out.release(1)
        fifo_factor.release(1)

    compute = Worker(
        task,
        fn_args=[fifo_in.cons(), fifo_factor.cons(), fifo_out.prod(), scale_kernel_fn],
    )

    # Host-side sequence: optionally trace, then push inputs and pull output.
    rt = Runtime()
    with rt.sequence(vec_ty, factor_ty, vec_ty) as (vec_in, scalar_f, vec_out):
        if trace_size:
            rt.enable_trace(trace_size, workers=[compute])
        rt.start(compute)
        rt.fill(fifo_in.prod(), vec_in)
        rt.fill(fifo_factor.prod(), scalar_f)
        rt.drain(fifo_out.cons(), vec_out, wait=True)

    return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer())


def main():
    """Run vector-scalar multiply on the NPU and verify against NumPy."""
    num_elems = 2048
    elem_dtype = np.int16
    scale_factor = 3

    input0 = iron.randint(0, 100, (num_elems,), dtype=elem_dtype, device="npu")
    factor_tensor = iron.tensor([scale_factor], dtype=np.int32, device="npu")
    output = iron.zeros(num_elems, dtype=elem_dtype, device="npu")

    vector_scalar_mul(input0, factor_tensor, output, N=num_elems, dtype=elem_dtype)

    # Bring results back to host memory before comparing.
    input0.to("cpu")
    output.to("cpu")
    # Reference computed in int64, then truncated to the element dtype to
    # mirror the device's wrap-around behavior.
    expected = (input0.numpy().astype(np.int64) * scale_factor).astype(elem_dtype)
    errors = np.sum(output.numpy() != expected)

    if errors:
        print(f"\nError count: {errors}")
        print("\nfailed.\n")
        sys.exit(1)
    print("\nPASS!\n")
    sys.exit(0)


# Script entry point: build, run, and verify the vector-scalar-mul design.
if __name__ == "__main__":
    main()
Loading
Loading