Skip to content
Draft

[WIP] #3025

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
16df05e
Unified compilation workflow: CompilableDesign, CallableDesign, @iron…
hunhoffe Apr 6, 2026
6d4af17
Re-apply migration to test_compile_cache_functionality.py
hunhoffe Apr 6, 2026
dd1cbb5
Finish unify-compilation-workflow plan: CMakeLists, __init__, test cl…
hunhoffe Apr 9, 2026
50821eb
Add comprehensive kernel factory wrappers with unit tests
hunhoffe Apr 9, 2026
7522a18
Add trace_config support to CallableDesign and @iron.jit
hunhoffe Apr 9, 2026
619a13b
migrate passthrough_kernel to @iron.jit
hunhoffe Apr 9, 2026
15c1515
migrate vector_reduce_add to @iron.jit
hunhoffe Apr 9, 2026
f2221c6
migrate vector_scalar_mul to @iron.jit
hunhoffe Apr 9, 2026
268d110
migrate eltwise_add and eltwise_mul to @iron.jit
hunhoffe Apr 9, 2026
7092069
Audit fixes: API quality, naming, validation, brevity
hunhoffe Apr 9, 2026
a04222f
Eliminate duplicate global-scanning logic between compilabledesign an…
hunhoffe Apr 9, 2026
8b8d2dc
Fix lower() to warn when call-time kwargs are overridden by pre-bound…
hunhoffe Apr 9, 2026
c57eff5
Fix lower(): call-time Compile[T] kwargs override pre-bound values
hunhoffe Apr 9, 2026
9247b0a
Fix ExternalFunction hash collision causing flaky NPU tests
hunhoffe Apr 9, 2026
4b3cccd
Fix ExternalFunction cache bugs causing flaky NPU tests
hunhoffe Apr 9, 2026
46e60ff
Replace xfail with skip_on_f32_failure fixture for Peano f32 bug
hunhoffe Apr 9, 2026
0de06cd
Revert programming_examples changes to main state
hunhoffe Apr 13, 2026
2d711fe
Merge branch 'main' into unify-compilation-workflow
hunhoffe Apr 13, 2026
05dc9f3
Move compile/jit/kernels code from iron to utils; split kernels into …
hunhoffe Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/modulesXilinx
Submodule modulesXilinx updated 1 files
+7 −21 FindXRT.cmake
3 changes: 0 additions & 3 deletions programming_examples/getting_started/00_memcpy/README.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ This design consists of the following:
JIT decorator to compile the design into a binary to run on the NPU, as well as
to describe the program that runs on the CPU (host) that calculates a correct
reference output, verifies and times our NPU design's execution.
* `passThrough.cc`: A C++ vectorized kernel that exposes efficient
vector operations on the AI Engine using the
[AIE API](https://xilinx.github.io/aie_api/index.html).
* `run.lit`: lit tests that run the design on different NPU devices.

## Step-by-Step Instructions
Expand Down
31 changes: 12 additions & 19 deletions programming_examples/getting_started/00_memcpy/memcpy.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@
import numpy as np
import argparse
import sys
import os
import time

import aie.iron as iron
from aie.iron import ExternalFunction, jit
from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron import Compile, In, Out, jit
from aie.iron import kernels, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.helpers.taplib.tap import TensorAccessPattern
from aie.utils.config import cxx_header_path

#
# Memcpy is designed to use every column's shimDMA in-out pairs
Expand All @@ -29,20 +27,20 @@
# Parameters:
# - use_cache (bool): Use cached MLIR module if available. Defaults to True.
@iron.jit
def my_memcpy(input0, output):
def my_memcpy(
input0: In,
output: Out,
*,
size: Compile[int],
xfr_dtype: Compile[type] = np.int32,
):
# --------------------------------------------------------------------------
# Configuration
# --------------------------------------------------------------------------

xfr_dtype = output.dtype

# Number of channels must be 1 or 2
num_channels = 2

# Transfer size must be a multiple of 1024 and divisible by the number of
# columns and 2 channels per column
size = output.shape[0]

# Number of columns on the device (4 for npu1 and 8 for npu2)
device = iron.get_current_device()
num_columns = device.cols
Expand Down Expand Up @@ -85,12 +83,7 @@ def my_memcpy(input0, output):
# --------------------------------------------------------------------------

# External, binary kernel definition
passthrough_fn = ExternalFunction(
"passThrough",
source_file=os.path.join(os.path.dirname(__file__), "passThrough.cc"),
arg_types=[line_type, line_type, np.int32],
include_dirs=[cxx_header_path()],
)
passthrough_fn = kernels.passthrough(tile_size=line_size, dtype=xfr_dtype)

# Task for the core to perform
def core_fn(of_in, of_out, passThroughLine):
Expand Down Expand Up @@ -195,11 +188,11 @@ def main():

# JIT-compiles the kernel, then launches it with the given arguments. Future calls
# to the kernel will use the same compiled kernel and loaded code objects
my_memcpy(input0, output_jit)
my_memcpy(input0, output_jit, size=length, xfr_dtype=element_type)

# Measure performance on the second execution using the JIT-cached design
start_time = time.perf_counter()
my_memcpy(input0, output)
my_memcpy(input0, output, size=length, xfr_dtype=element_type)
end_time = time.perf_counter()

elapsed_time = end_time - start_time # seconds
Expand Down
44 changes: 0 additions & 44 deletions programming_examples/getting_started/00_memcpy/passThrough.cc

This file was deleted.

8 changes: 3 additions & 5 deletions programming_examples/getting_started/01_SAXPY/saxpy.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import os

import aie.iron as iron
from aie.iron import ExternalFunction
from aie.iron import Compile, ExternalFunction, In, Out
from aie.iron import ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.utils.config import cxx_header_path
Expand All @@ -21,9 +21,7 @@
# Parameters:
# - use_cache (bool): Use cached MLIR module if available. Defaults to True.
@iron.jit
def saxpy(input0, input1, output):
N = input0.shape[0] # Tensor size
element_type = output.dtype
def saxpy(input0: In, input1: In, output: Out, *, N: Compile[int], element_type: Compile[type]):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
def saxpy(input0: In, input1: In, output: Out, *, N: Compile[int], element_type: Compile[type]):
def saxpy(
input0: In, input1: In, output: Out, *, N: Compile[int], element_type: Compile[type]
):


# --------------------------------------------------------------------------
# In-Array Data Movement
Expand Down Expand Up @@ -97,7 +95,7 @@ def main():

# JIT-compiles the kernel, then launches it with the given arguments. Future calls
# to the kernel will use the same compiled kernel and loaded code objects
saxpy(input0, input1, output)
saxpy(input0, input1, output, N=data_size, element_type=element_type)

# Check the correctness of the result and print any mismatches
ref_vec = [3 * input0[i] + input1[i] for i in range(data_size)]
Expand Down

This file was deleted.

45 changes: 22 additions & 23 deletions programming_examples/getting_started/02_vector_reduce_max/vector_reduce_max_1col.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,23 @@
from ml_dtypes import bfloat16
import numpy as np
import sys
import os

import aie.iron as iron
from aie.iron import ExternalFunction
from aie.iron import Compile, ExternalFunction, In, Out
from aie.iron import ObjectFifo, Program, Runtime, Worker, Buffer
from aie.utils.config import cxx_header_path
from aie.iron.placers import SequentialPlacer
from aie.iron.controlflow import range_
from aie.helpers.util import np_ndarray_type_get_shape
from aie.helpers.dialects.scf import if_, else_
from aie.utils.config import cxx_header_path


# JIT decorator for IRON
# Decorator to compile an IRON kernel into a binary to run on the NPU.
# Parameters:
# - use_cache (bool): Use cached MLIR module if available. Defaults to True.
@iron.jit
def vector_reduce_max(input0, output):
element_type = output.dtype

in_tensor_size = input0.shape[0] # Input tensor size
out_tensor_size = output.shape[0] # Output tensor size
def vector_reduce_max(input0: In, output: Out, *, in_tensor_size: Compile[int], element_type: Compile[type]):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
def vector_reduce_max(input0: In, output: Out, *, in_tensor_size: Compile[int], element_type: Compile[type]):
def vector_reduce_max(
input0: In,
output: Out,
*,
in_tensor_size: Compile[int],
element_type: Compile[type],
):


n_cores = 4
N = 2048
Expand All @@ -43,7 +38,12 @@ def vector_reduce_max(input0, output):
in_ty = np.ndarray[(in_tensor_size,), np.dtype[element_type]]
mem_ty = np.ndarray[(N,), np.dtype[element_type]]
op_ty = np.ndarray[(elems_per_core,), np.dtype[element_type]]
out_ty = np.ndarray[(out_tensor_size,), np.dtype[element_type]]
# DMA transfers must be 4-byte aligned; pad to the minimum element count
# that satisfies this: ceil(4 / itemsize).
_dma_align = 4
_itemsize = np.dtype(element_type).itemsize
out_elems = (_dma_align + _itemsize - 1) // _itemsize
out_ty = np.ndarray[(out_elems,), np.dtype[element_type]]

# Input A and Output C
of_in = ObjectFifo(mem_ty, name="of_in")
Expand All @@ -68,30 +68,33 @@ def vector_reduce_max(input0, output):
names=[f"memA{i}" for i in range(n_cores)],
)

min_val = np.array([bfloat16(float("-inf"))], dtype=element_type)
min_val = np.full(out_elems, bfloat16(float("-inf")), dtype=element_type)
nextC_buffers = []
tmp_buffers = []
for i in range(n_cores):
out_fifos.append(ObjectFifo(out_ty, name=f"memC{i}"))
nextC_buffers.append(
Buffer(
type=np.ndarray[(out_tensor_size,), np.dtype[element_type]],
type=out_ty,
initial_value=min_val,
)
)
tmp_buffers.append(
Buffer(
type=np.ndarray[(out_tensor_size,), np.dtype[element_type]],
type=out_ty,
initial_value=min_val,
)
)
# --------------------------------------------------------------------------
# Task each core will run
# --------------------------------------------------------------------------

# Use ExternalFunction with a 2-element output buffer (4 bytes) for DMA alignment.
# kernels.reduce_max() uses a 1-element output which is only 2 bytes for bfloat16,
# violating the 4-byte DMA alignment requirement.
reduce_max_vector = ExternalFunction(
f"reduce_max_vector_bfloat16",
source_file=os.path.join(os.path.dirname(__file__), "reduce_max_vector.cc"),
"reduce_max_vector_bfloat16",
source_file=cxx_header_path() + "/aie_kernels/aie2/reduce_max.cc",
arg_types=[op_ty, out_ty, np.int32],
include_dirs=[cxx_header_path()],
)
Expand Down Expand Up @@ -183,21 +186,17 @@ def main():
out_size = 4
element_type = bfloat16

assert (
out_size == 4
), "Output buffer must be size 4 (4 bytes = 2 bfloat16 elements)."

in_tensor_size = in_size // element_type(0).nbytes
out_tensor_size = out_size // element_type(0).nbytes

# Construct an input tensor and an output zeroed tensor
# The two tensors are in memory accessible to the NPU
# Allocate output with enough elements for 4-byte DMA alignment.
_dma_align = 4
out_elems = (_dma_align + element_type(0).nbytes - 1) // element_type(0).nbytes
input0 = iron.arange(in_tensor_size, dtype=element_type, device="npu")
output = iron.arange(out_tensor_size, dtype=element_type, device="npu")
output = iron.zeros(out_elems, dtype=element_type, device="npu")

# JIT-compiles the kernel, then launches it with the given arguments. Future calls
# to the kernel will use the same compiled kernel and loaded code objects
vector_reduce_max(input0, output)
vector_reduce_max(input0, output, in_tensor_size=in_tensor_size, element_type=element_type)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
vector_reduce_max(input0, output, in_tensor_size=in_tensor_size, element_type=element_type)
vector_reduce_max(
input0, output, in_tensor_size=in_tensor_size, element_type=element_type
)


# Check the correctness of the result and print.
# Initialize to -inf so the reference is correct for all-negative inputs.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import os

import aie.iron as iron
from aie.iron import ExternalFunction, jit
from aie.iron import Compile, ExternalFunction, In, Out, jit
from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.iron.controlflow import range_
Expand All @@ -22,16 +22,13 @@
# Parameters:
# - use_cache (bool): Use cached MLIR module if available. Defaults to True.
@iron.jit
def matrix_multiplication_single_core(input0, input1, output):
def matrix_multiplication_single_core(input0: In, input1: In, output: Out, *, M: Compile[int], K: Compile[int], N: Compile[int], element_type: Compile[type]):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
def matrix_multiplication_single_core(input0: In, input1: In, output: Out, *, M: Compile[int], K: Compile[int], N: Compile[int], element_type: Compile[type]):
def matrix_multiplication_single_core(
input0: In,
input1: In,
output: Out,
*,
M: Compile[int],
K: Compile[int],
N: Compile[int],
element_type: Compile[type]
):

# Problem size
# - matrix0 shapes: (M, K)
# - matrix1 shapes: (K, N)
M, K, N = input0.shape[0], input0.shape[1], input1.shape[1]
m, k, n = 64, 64, 64 # Tile size moved to/from the compute cores via mem tiles
r, s, t = 8, 2, 8 # AIE kernel intrinsic size

element_type = output.dtype

# --------------------------------------------------------------------------
# In-Array Data Movement
# --------------------------------------------------------------------------
Expand Down Expand Up @@ -176,7 +173,7 @@ def main():

# JIT-compiles the kernel, then launches it with the given arguments. Future calls
# to the kernel will use the same compiled kernel and loaded code objects
matrix_multiplication_single_core(input0, input1, output)
matrix_multiplication_single_core(input0, input1, output, M=M, K=K, N=N, element_type=element_type)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[black] reported by reviewdog 🐶

Suggested change
matrix_multiplication_single_core(input0, input1, output, M=M, K=K, N=N, element_type=element_type)
matrix_multiplication_single_core(
input0, input1, output, M=M, K=K, N=N, element_type=element_type
)


# Check the correctness of the result
e = np.equal(ref_vec.flatten(), output.numpy())
Expand Down
Loading
Loading