Skip to content
Draft

[WIP] #3025

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
16df05e
Unified compilation workflow: CompilableDesign, CallableDesign, @iron…
hunhoffe Apr 6, 2026
6d4af17
Re-apply migration to test_compile_cache_functionality.py
hunhoffe Apr 6, 2026
dd1cbb5
Finish unify-compilation-workflow plan: CMakeLists, __init__, test cl…
hunhoffe Apr 9, 2026
50821eb
Add comprehensive kernel factory wrappers with unit tests
hunhoffe Apr 9, 2026
7522a18
Add trace_config support to CallableDesign and @iron.jit
hunhoffe Apr 9, 2026
619a13b
migrate passthrough_kernel to @iron.jit
hunhoffe Apr 9, 2026
15c1515
migrate vector_reduce_add to @iron.jit
hunhoffe Apr 9, 2026
f2221c6
migrate vector_scalar_mul to @iron.jit
hunhoffe Apr 9, 2026
268d110
migrate eltwise_add and eltwise_mul to @iron.jit
hunhoffe Apr 9, 2026
7092069
Audit fixes: API quality, naming, validation, brevity
hunhoffe Apr 9, 2026
a04222f
Eliminate duplicate global-scanning logic between compilabledesign an…
hunhoffe Apr 9, 2026
8b8d2dc
Fix lower() to warn when call-time kwargs are overridden by pre-bound…
hunhoffe Apr 9, 2026
c57eff5
Fix lower(): call-time Compile[T] kwargs override pre-bound values
hunhoffe Apr 9, 2026
9247b0a
Fix ExternalFunction hash collision causing flaky NPU tests
hunhoffe Apr 9, 2026
4b3cccd
Fix ExternalFunction cache bugs causing flaky NPU tests
hunhoffe Apr 9, 2026
46e60ff
Replace xfail with skip_on_f32_failure fixture for Peano f32 bug
hunhoffe Apr 9, 2026
0de06cd
Revert programming_examples changes to main state
hunhoffe Apr 13, 2026
2d711fe
Merge branch 'main' into unify-compilation-workflow
hunhoffe Apr 13, 2026
05dc9f3
Move compile/jit/kernels code from iron to utils; split kernels into …
hunhoffe Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/modulesXilinx
Submodule modulesXilinx updated 1 files
+7 −21 FindXRT.cmake
3 changes: 0 additions & 3 deletions programming_examples/getting_started/00_memcpy/README.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ This design consists of the following:
JIT decorator to compile the design into a binary to run on the NPU, as well as
to describe the program that runs on the CPU (host) that calculates a correct
reference output, verifies and times our NPU design's execution.
* `passThrough.cc`: A C++ vectorized kernel that exposes efficient
vector operations on the AI Engine using the
[AIE API](https://xilinx.github.io/aie_api/index.html).
* `run.lit`: lit tests that run the design on different NPU devices.

## Step-by-Step Instructions
Expand Down
31 changes: 12 additions & 19 deletions programming_examples/getting_started/00_memcpy/memcpy.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@
import numpy as np
import argparse
import sys
import os
import time

import aie.iron as iron
from aie.iron import ExternalFunction, jit
from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron import Compile, In, Out, jit
from aie.iron import kernels, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.helpers.taplib.tap import TensorAccessPattern
from aie.utils.config import cxx_header_path

#
# Memcpy is designed to use every column's shimDMA in-out pairs
Expand All @@ -29,20 +27,20 @@
# Parameters:
# - use_cache (bool): Use cached MLIR module if available. Defaults to True.
@iron.jit
def my_memcpy(input0, output):
def my_memcpy(
input0: In,
output: Out,
*,
size: Compile[int],
xfr_dtype: Compile[type] = np.int32,
):
# --------------------------------------------------------------------------
# Configuration
# --------------------------------------------------------------------------

xfr_dtype = output.dtype

# Number of channels must be 1 or 2
num_channels = 2

# Transfer size must be a multiple of 1024 and divisible by the number of
# columns and 2 channels per column
size = output.shape[0]

# Number of columns on the device (4 for npu1 and 8 for npu2)
device = iron.get_current_device()
num_columns = device.cols
Expand Down Expand Up @@ -85,12 +83,7 @@ def my_memcpy(input0, output):
# --------------------------------------------------------------------------

# External, binary kernel definition
passthrough_fn = ExternalFunction(
"passThrough",
source_file=os.path.join(os.path.dirname(__file__), "passThrough.cc"),
arg_types=[line_type, line_type, np.int32],
include_dirs=[cxx_header_path()],
)
passthrough_fn = kernels.passthrough(tile_size=line_size, dtype=xfr_dtype)

# Task for the core to perform
def core_fn(of_in, of_out, passThroughLine):
Expand Down Expand Up @@ -195,11 +188,11 @@ def main():

# JIT-compiles the kernel, then launches it with the given arguments. Future calls
# to the kernel will use the same compiled kernel and loaded code objects
my_memcpy(input0, output_jit)
my_memcpy(input0, output_jit, size=length, xfr_dtype=element_type)

# Measure performance on the second execution using the JIT-cached design
start_time = time.perf_counter()
my_memcpy(input0, output)
my_memcpy(input0, output, size=length, xfr_dtype=element_type)
end_time = time.perf_counter()

elapsed_time = end_time - start_time # seconds
Expand Down
44 changes: 0 additions & 44 deletions programming_examples/getting_started/00_memcpy/passThrough.cc

This file was deleted.

8 changes: 3 additions & 5 deletions programming_examples/getting_started/01_SAXPY/saxpy.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import os

import aie.iron as iron
from aie.iron import ExternalFunction
from aie.iron import Compile, ExternalFunction, In, Out
from aie.iron import ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.utils.config import cxx_header_path
Expand All @@ -21,9 +21,7 @@
# Parameters:
# - use_cache (bool): Use cached MLIR module if available. Defaults to True.
@iron.jit
def saxpy(input0, input1, output):
N = input0.shape[0] # Tensor size
element_type = output.dtype
def saxpy(input0: In, input1: In, output: Out, *, N: Compile[int], element_type: Compile[type]):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
def saxpy(input0: In, input1: In, output: Out, *, N: Compile[int], element_type: Compile[type]):
def saxpy(
input0: In, input1: In, output: Out, *, N: Compile[int], element_type: Compile[type]
):


# --------------------------------------------------------------------------
# In-Array Data Movement
Expand Down Expand Up @@ -97,7 +95,7 @@ def main():

# JIT-compiles the kernel, then launches it with the given arguments. Future calls
# to the kernel will use the same compiled kernel and loaded code objects
saxpy(input0, input1, output)
saxpy(input0, input1, output, N=data_size, element_type=element_type)

# Check the correctness of the result and print any mismatches
ref_vec = [3 * input0[i] + input1[i] for i in range(data_size)]
Expand Down

This file was deleted.

45 changes: 22 additions & 23 deletions programming_examples/getting_started/02_vector_reduce_max/vector_reduce_max_1col.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,23 @@
from ml_dtypes import bfloat16
import numpy as np
import sys
import os

import aie.iron as iron
from aie.iron import ExternalFunction
from aie.iron import Compile, ExternalFunction, In, Out
from aie.iron import ObjectFifo, Program, Runtime, Worker, Buffer
from aie.utils.config import cxx_header_path
from aie.iron.placers import SequentialPlacer
from aie.iron.controlflow import range_
from aie.helpers.util import np_ndarray_type_get_shape
from aie.helpers.dialects.scf import if_, else_
from aie.utils.config import cxx_header_path


# JIT decorator for IRON
# Decorator to compile an IRON kernel into a binary to run on the NPU.
# Parameters:
# - use_cache (bool): Use cached MLIR module if available. Defaults to True.
@iron.jit
def vector_reduce_max(input0, output):
element_type = output.dtype

in_tensor_size = input0.shape[0] # Input tensor size
out_tensor_size = output.shape[0] # Output tensor size
def vector_reduce_max(input0: In, output: Out, *, in_tensor_size: Compile[int], element_type: Compile[type]):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
def vector_reduce_max(input0: In, output: Out, *, in_tensor_size: Compile[int], element_type: Compile[type]):
def vector_reduce_max(
input0: In,
output: Out,
*,
in_tensor_size: Compile[int],
element_type: Compile[type],
):


n_cores = 4
N = 2048
Expand All @@ -43,7 +38,12 @@ def vector_reduce_max(input0, output):
in_ty = np.ndarray[(in_tensor_size,), np.dtype[element_type]]
mem_ty = np.ndarray[(N,), np.dtype[element_type]]
op_ty = np.ndarray[(elems_per_core,), np.dtype[element_type]]
out_ty = np.ndarray[(out_tensor_size,), np.dtype[element_type]]
# DMA transfers must be 4-byte aligned; pad to the minimum element count
# that satisfies this: ceil(4 / itemsize).
_dma_align = 4
_itemsize = np.dtype(element_type).itemsize
out_elems = (_dma_align + _itemsize - 1) // _itemsize
out_ty = np.ndarray[(out_elems,), np.dtype[element_type]]

# Input A and Output C
of_in = ObjectFifo(mem_ty, name="of_in")
Expand All @@ -68,30 +68,33 @@ def vector_reduce_max(input0, output):
names=[f"memA{i}" for i in range(n_cores)],
)

min_val = np.array([bfloat16(float("-inf"))], dtype=element_type)
min_val = np.full(out_elems, bfloat16(float("-inf")), dtype=element_type)
nextC_buffers = []
tmp_buffers = []
for i in range(n_cores):
out_fifos.append(ObjectFifo(out_ty, name=f"memC{i}"))
nextC_buffers.append(
Buffer(
type=np.ndarray[(out_tensor_size,), np.dtype[element_type]],
type=out_ty,
initial_value=min_val,
)
)
tmp_buffers.append(
Buffer(
type=np.ndarray[(out_tensor_size,), np.dtype[element_type]],
type=out_ty,
initial_value=min_val,
)
)
# --------------------------------------------------------------------------
# Task each core will run
# --------------------------------------------------------------------------

# Use ExternalFunction with a 2-element output buffer (4 bytes) for DMA alignment.
# kernels.reduce_max() uses a 1-element output which is only 2 bytes for bfloat16,
# violating the 4-byte DMA alignment requirement.
reduce_max_vector = ExternalFunction(
f"reduce_max_vector_bfloat16",
source_file=os.path.join(os.path.dirname(__file__), "reduce_max_vector.cc"),
"reduce_max_vector_bfloat16",
source_file=cxx_header_path() + "/aie_kernels/aie2/reduce_max.cc",
arg_types=[op_ty, out_ty, np.int32],
include_dirs=[cxx_header_path()],
)
Expand Down Expand Up @@ -183,21 +186,17 @@ def main():
out_size = 4
element_type = bfloat16

assert (
out_size == 4
), "Output buffer must be size 4 (4 bytes = 2 bfloat16 elements)."

in_tensor_size = in_size // element_type(0).nbytes
out_tensor_size = out_size // element_type(0).nbytes

# Construct an input tensor and an output zeroed tensor
# The two tensors are in memory accessible to the NPU
# Allocate output with enough elements for 4-byte DMA alignment.
_dma_align = 4
out_elems = (_dma_align + element_type(0).nbytes - 1) // element_type(0).nbytes
input0 = iron.arange(in_tensor_size, dtype=element_type, device="npu")
output = iron.arange(out_tensor_size, dtype=element_type, device="npu")
output = iron.zeros(out_elems, dtype=element_type, device="npu")

# JIT-compiles the kernel, then launches it with the given arguments. Future calls
# to the kernel will use the same compiled kernel and loaded code objects
vector_reduce_max(input0, output)
vector_reduce_max(input0, output, in_tensor_size=in_tensor_size, element_type=element_type)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
vector_reduce_max(input0, output, in_tensor_size=in_tensor_size, element_type=element_type)
vector_reduce_max(
input0, output, in_tensor_size=in_tensor_size, element_type=element_type
)


# Check the correctness of the result and print.
# Initialize to -inf so the reference is correct for all-negative inputs.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import os

import aie.iron as iron
from aie.iron import ExternalFunction, jit
from aie.iron import Compile, ExternalFunction, In, Out, jit
from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.iron.controlflow import range_
Expand All @@ -22,16 +22,13 @@
# Parameters:
# - use_cache (bool): Use cached MLIR module if available. Defaults to True.
@iron.jit
def matrix_multiplication_single_core(input0, input1, output):
def matrix_multiplication_single_core(input0: In, input1: In, output: Out, *, M: Compile[int], K: Compile[int], N: Compile[int], element_type: Compile[type]):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
def matrix_multiplication_single_core(input0: In, input1: In, output: Out, *, M: Compile[int], K: Compile[int], N: Compile[int], element_type: Compile[type]):
def matrix_multiplication_single_core(
input0: In,
input1: In,
output: Out,
*,
M: Compile[int],
K: Compile[int],
N: Compile[int],
element_type: Compile[type]
):

# Problem size
# - matrix0 shapes: (M, K)
# - matrix1 shapes: (K, N)
M, K, N = input0.shape[0], input0.shape[1], input1.shape[1]
m, k, n = 64, 64, 64 # Tile size moved to/from the compute cores via mem tiles
r, s, t = 8, 2, 8 # AIE kernel intrinsic size

element_type = output.dtype

# --------------------------------------------------------------------------
# In-Array Data Movement
# --------------------------------------------------------------------------
Expand Down Expand Up @@ -176,7 +173,7 @@ def main():

# JIT-compiles the kernel, then launches it with the given arguments. Future calls
# to the kernel will use the same compiled kernel and loaded code objects
matrix_multiplication_single_core(input0, input1, output)
matrix_multiplication_single_core(input0, input1, output, M=M, K=K, N=N, element_type=element_type)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
matrix_multiplication_single_core(input0, input1, output, M=M, K=K, N=N, element_type=element_type)
matrix_multiplication_single_core(
input0, input1, output, M=M, K=K, N=N, element_type=element_type
)


# Check the correctness of the result
e = np.equal(ref_vec.flatten(), output.numpy())
Expand Down
Loading
Loading