Skip to content
Draft

[WIP] #3025

Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
16df05e
Unified compilation workflow: CompilableDesign, CallableDesign, @iron…
hunhoffe Apr 6, 2026
6d4af17
Re-apply migration to test_compile_cache_functionality.py
hunhoffe Apr 6, 2026
dd1cbb5
Finish unify-compilation-workflow plan: CMakeLists, __init__, test cl…
hunhoffe Apr 9, 2026
50821eb
Add comprehensive kernel factory wrappers with unit tests
hunhoffe Apr 9, 2026
7522a18
Add trace_config support to CallableDesign and @iron.jit
hunhoffe Apr 9, 2026
619a13b
migrate passthrough_kernel to @iron.jit
hunhoffe Apr 9, 2026
15c1515
migrate vector_reduce_add to @iron.jit
hunhoffe Apr 9, 2026
f2221c6
migrate vector_scalar_mul to @iron.jit
hunhoffe Apr 9, 2026
268d110
migrate eltwise_add and eltwise_mul to @iron.jit
hunhoffe Apr 9, 2026
7092069
Audit fixes: API quality, naming, validation, brevity
hunhoffe Apr 9, 2026
a04222f
Eliminate duplicate global-scanning logic between compilabledesign an…
hunhoffe Apr 9, 2026
8b8d2dc
Fix lower() to warn when call-time kwargs are overridden by pre-bound…
hunhoffe Apr 9, 2026
c57eff5
Fix lower(): call-time Compile[T] kwargs override pre-bound values
hunhoffe Apr 9, 2026
9247b0a
Fix ExternalFunction hash collision causing flaky NPU tests
hunhoffe Apr 9, 2026
4b3cccd
Fix ExternalFunction cache bugs causing flaky NPU tests
hunhoffe Apr 9, 2026
46e60ff
Replace xfail with skip_on_f32_failure fixture for Peano f32 bug
hunhoffe Apr 9, 2026
0de06cd
Revert programming_examples changes to main state
hunhoffe Apr 13, 2026
2d711fe
Merge branch 'main' into unify-compilation-workflow
hunhoffe Apr 13, 2026
05dc9f3
Move compile/jit/kernels code from iron to utils; split kernels into …
hunhoffe Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/modulesXilinx
Submodule modulesXilinx updated 1 file
+7 −21 FindXRT.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# passthrough_kernel/passthrough_kernel_iron_jit.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates

import numpy as np
import sys

import aie.iron as iron
from aie.iron import Compile, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer


@iron.jit
def passthrough_kernel(
    input0: iron.In,
    output: iron.Out,
    *,
    N: Compile[int],
    trace_size: Compile[int] = 0,
):
    """Build an AIE design that copies ``input0`` to ``output`` unchanged.

    Parameters
    ----------
    input0 : iron.In
        Input tensor of ``N`` uint8 elements.
    output : iron.Out
        Output tensor of ``N`` uint8 elements.
    N : Compile[int]
        Number of elements to move (compile-time parameter).
    trace_size : Compile[int], optional
        When non-zero, enables tracing with a buffer of this size.

    Returns
    -------
    The placed MLIR module produced by ``resolve_program``.
    """
    in_dtype = np.uint8
    line_size = N
    line_type = np.ndarray[(line_size,), np.dtype[in_dtype]]

    # Dataflow with ObjectFifos
    of_in = ObjectFifo(line_type, name="in")
    of_out = ObjectFifo(line_type, name="out")

    # External kernel from installed aie_kernels
    passthrough_fn = iron.kernels.passthrough(tile_size=line_size, dtype=in_dtype)

    # Task for the core to perform: acquire one element on each fifo,
    # run the passthrough kernel, then release both.
    def core_fn(of_in, of_out, passThroughLine):
        elemOut = of_out.acquire(1)
        elemIn = of_in.acquire(1)
        passThroughLine(elemIn, elemOut, line_size)
        of_in.release(1)
        of_out.release(1)

    # Create a worker to perform the task.
    # NOTE: use the fn_args= keyword for consistency with the other
    # @iron.jit examples (my_reduce_add, vector_scalar_mul).
    my_worker = Worker(
        core_fn,
        fn_args=[of_in.cons(), of_out.prod(), passthrough_fn],
    )

    # Runtime operations to move data to/from the AIE-array
    rt = Runtime()
    with rt.sequence(line_type, line_type) as (a_in, b_out):
        if trace_size:
            rt.enable_trace(trace_size, workers=[my_worker])
        rt.start(my_worker)
        rt.fill(of_in.prod(), a_in)
        rt.drain(of_out.cons(), b_out, wait=True)

    # Place components and generate an MLIR module
    return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer())


def main():
    """Run the pass-through kernel on the NPU and verify the copy."""
    num_elems = 4096

    input0 = iron.arange(num_elems, dtype=np.uint8, device="npu")
    output = iron.zeros(num_elems, dtype=np.uint8, device="npu")

    passthrough_kernel(input0, output, N=num_elems)

    # Bring both tensors back to host memory before comparing.
    input0.to("cpu")
    output.to("cpu")
    matches = np.equal(input0.numpy(), output.numpy())
    errors = np.size(matches) - np.count_nonzero(matches)

    if errors:
        print("\nError count: ", errors)
        print("\nfailed.\n")
        sys.exit(1)
    print("\nPASS!\n")
    sys.exit(0)


# Script entry point: build, run, and verify the pass-through design.
if __name__ == "__main__":
    main()
7 changes: 7 additions & 0 deletions programming_examples/basic/passthrough_kernel/run_jit.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// (c) Copyright 2026 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, peano
//
// RUN: %run_on_npu1% python3 %S/passthrough_kernel_iron_jit.py
// RUN: %run_on_npu2% python3 %S/passthrough_kernel_iron_jit.py
7 changes: 7 additions & 0 deletions programming_examples/basic/vector_reduce_add/run_jit.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// (c) Copyright 2026 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, peano
//
// RUN: %run_on_npu1% python3 %S/vector_reduce_add_iron_jit.py
// RUN: %run_on_npu2% python3 %S/vector_reduce_add_iron_jit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# vector_reduce_add/vector_reduce_add_iron_jit.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates

import numpy as np
import sys

import aie.iron as iron
from aie.iron import Compile, In, Out, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer


@iron.jit
def my_reduce_add(
    input_tensor: In,
    output_tensor: Out,
    *,
    N: Compile[int] = 1024,
):
    """Sum the ``N`` int32 elements of ``input_tensor`` into ``output_tensor[0]``.

    ``N`` is a compile-time parameter; the design is specialized per value.
    Returns the placed MLIR module from ``resolve_program``.
    """
    vec_ty = np.ndarray[(N,), np.dtype[np.int32]]
    result_ty = np.ndarray[(1,), np.dtype[np.int32]]

    # Dataflow: one fifo carries the input vector in, one carries the
    # single-element result out.
    fifo_in = ObjectFifo(vec_ty, name="in")
    fifo_out = ObjectFifo(result_ty, name="out")

    # Pre-built reduction kernel from the installed aie_kernels.
    reduce_kernel = iron.kernels.reduce_add(tile_size=N)

    # Core task: acquire one element per fifo, reduce, release both.
    def task(fifo_in, fifo_out, reduce_fn):
        result_elem = fifo_out.acquire(1)
        vec_elem = fifo_in.acquire(1)
        reduce_fn(vec_elem, result_elem, N)
        fifo_in.release(1)
        fifo_out.release(1)

    compute = Worker(task, fn_args=[fifo_in.cons(), fifo_out.prod(), reduce_kernel])

    # Host-side sequence: start the worker, push the input, pull the result.
    rt = Runtime()
    with rt.sequence(vec_ty, result_ty) as (vec_in, scalar_out):
        rt.start(compute)
        rt.fill(fifo_in.prod(), vec_in)
        rt.drain(fifo_out.cons(), scalar_out, wait=True)

    return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer())


def main():
    """Run the reduction on the NPU and check the result against NumPy."""
    N = 1024
    input_tensor = iron.randint(0, 100, (N,), dtype=np.int32, device="npu")
    output_tensor = iron.zeros((1,), dtype=np.int32, device="npu")

    my_reduce_add(input_tensor, output_tensor, N=N)

    # Move tensors back to host memory before reading them with .numpy(),
    # matching the sibling examples (passthrough_kernel, vector_scalar_mul),
    # which call .to("cpu") before verification.
    input_tensor.to("cpu")
    output_tensor.to("cpu")

    expected = int(np.sum(input_tensor.numpy()))
    computed = int(output_tensor.numpy()[0])

    if expected == computed:
        print("\nPASS!\n")
        sys.exit(0)
    else:
        print(f"\nFAIL! Expected {expected} but got {computed}")
        sys.exit(1)


# Script entry point: build, run, and verify the reduce-add design.
if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions programming_examples/basic/vector_scalar_mul/run_jit.lit
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
//
// RUN: %run_on_npu1% python3 %S/vector_scalar_mul_jit.py --iters 10 --warmup 10 -n 4096
// RUN: %run_on_npu2% python3 %S/vector_scalar_mul_jit.py --iters 10 --warmup 10 -n 4096
// RUN: %run_on_npu1% python3 %S/vector_scalar_mul_iron_jit.py
// RUN: %run_on_npu2% python3 %S/vector_scalar_mul_iron_jit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# vector_scalar_mul/vector_scalar_mul_iron_jit.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates

import numpy as np
import sys

import aie.iron as iron
from aie.iron import Compile, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.iron.controlflow import range_


@iron.jit
def vector_scalar_mul(
    input0: iron.In,
    factor: iron.In,
    output: iron.Out,
    *,
    N: Compile[int],
    dtype: Compile[type] = np.int16,
    trace_size: Compile[int] = 0,
):
    """Multiply each of the ``N`` elements of ``input0`` by the runtime
    scalar ``factor``, writing the result to ``output``.

    ``N``, ``dtype``, and ``trace_size`` are compile-time parameters.
    The vector is processed as four sub-vectors on a single compute tile.
    Returns the placed MLIR module from ``resolve_program``.
    """
    n_chunks = 4
    chunk_size = N // n_chunks

    # Tensor types: full vector at the host boundary, chunk-sized tiles
    # on the fifos, and a single int32 for the scale factor.
    vec_ty = np.ndarray[(N,), np.dtype[dtype]]
    chunk_ty = np.ndarray[(chunk_size,), np.dtype[dtype]]
    factor_ty = np.ndarray[(1,), np.dtype[np.int32]]

    # Pre-built scale kernel from the installed aie_kernels.
    scale_kernel_fn = iron.kernels.scale(tile_size=chunk_size, dtype=dtype)

    # Fifos for input chunks, the scalar factor, and output chunks.
    fifo_in = ObjectFifo(chunk_ty, name="in")
    fifo_factor = ObjectFifo(factor_ty, name="infactor")
    fifo_out = ObjectFifo(chunk_ty, name="out")

    # Core task: hold the factor for the whole run while streaming the
    # chunks through the scale kernel one at a time.
    def task(fifo_in, fifo_factor, fifo_out, scale_fn):
        factor_elem = fifo_factor.acquire(1)
        for _ in range_(n_chunks):
            src = fifo_in.acquire(1)
            dst = fifo_out.acquire(1)
            scale_fn(src, dst, factor_elem, chunk_size)
            fifo_in.release(1)
            fifo_out.release(1)
        fifo_factor.release(1)

    compute = Worker(
        task,
        fn_args=[fifo_in.cons(), fifo_factor.cons(), fifo_out.prod(), scale_kernel_fn],
    )

    # Host-side sequence: optionally trace, then push inputs and pull output.
    rt = Runtime()
    with rt.sequence(vec_ty, factor_ty, vec_ty) as (vec_in, scalar_f, vec_out):
        if trace_size:
            rt.enable_trace(trace_size, workers=[compute])
        rt.start(compute)
        rt.fill(fifo_in.prod(), vec_in)
        rt.fill(fifo_factor.prod(), scalar_f)
        rt.drain(fifo_out.cons(), vec_out, wait=True)

    return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer())


def main():
    """Run vector-scalar multiply on the NPU and verify against NumPy."""
    num_elems = 2048
    elem_dtype = np.int16
    scale_factor = 3

    input0 = iron.randint(0, 100, (num_elems,), dtype=elem_dtype, device="npu")
    factor_tensor = iron.tensor([scale_factor], dtype=np.int32, device="npu")
    output = iron.zeros(num_elems, dtype=elem_dtype, device="npu")

    vector_scalar_mul(input0, factor_tensor, output, N=num_elems, dtype=elem_dtype)

    # Bring results back to host memory before comparing.
    input0.to("cpu")
    output.to("cpu")
    # Reference computed in int64, then truncated to the element dtype to
    # mirror the device's wrap-around behavior.
    expected = (input0.numpy().astype(np.int64) * scale_factor).astype(elem_dtype)
    errors = np.sum(output.numpy() != expected)

    if errors:
        print(f"\nError count: {errors}")
        print("\nfailed.\n")
        sys.exit(1)
    print("\nPASS!\n")
    sys.exit(0)


# Script entry point: build, run, and verify the vector-scalar-mul design.
if __name__ == "__main__":
    main()
Loading
Loading