diff --git a/.gitignore b/.gitignore index af53dc4e784..27ebd57e970 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ include/**/Makefile lib/**/Makefile CLAUDE.md +AGENTS.md +.codex/ /platforms/vck190_bare/petalinux/build /platforms/vck190_bare/petalinux/components /platforms/vck190_bare/petalinux/images diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9ded73daef0..4c3852fe2b1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/kynan/nbstripout - rev: 0.8.0 + rev: 0.8.1 hooks: - id: nbstripout args: [--drop-empty-cells] @@ -14,7 +14,7 @@ repos: stages: [pre-push] - repo: https://github.com/psf/black - rev: 24.10.0 + rev: 26.3.1 hooks: - id: black stages: [pre-push] diff --git a/include/aie/Conversion/AIEXToEmitC/AIEXToEmitC.h b/include/aie/Conversion/AIEXToEmitC/AIEXToEmitC.h new file mode 100644 index 00000000000..fc7bb5313f4 --- /dev/null +++ b/include/aie/Conversion/AIEXToEmitC/AIEXToEmitC.h @@ -0,0 +1,25 @@ +//===- AIEXToEmitC.h - AIEX to EmitC conversion -----------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#ifndef AIE_CONVERSION_AIEXTOEMITC_AIEXTOEMITC_H +#define AIE_CONVERSION_AIEXTOEMITC_AIEXTOEMITC_H + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include <memory> + +namespace xilinx { + +std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>> +createConvertAIEXToEmitCPass(); + +} // namespace xilinx + +#endif // AIE_CONVERSION_AIEXTOEMITC_AIEXTOEMITC_H diff --git a/include/aie/Conversion/Passes.h b/include/aie/Conversion/Passes.h index 366e85793a6..e94c497000b 100644 --- a/include/aie/Conversion/Passes.h +++ b/include/aie/Conversion/Passes.h @@ -13,6 +13,7 @@ #include "aie/Conversion/AIEToConfiguration/AIEToConfiguration.h" #include "aie/Conversion/AIEVecToLLVM/AIEVecToLLVM.h" +#include "aie/Conversion/AIEXToEmitC/AIEXToEmitC.h" #include "aie/Conversion/PassesEnums.h.inc" namespace xilinx { diff --git a/include/aie/Conversion/Passes.td b/include/aie/Conversion/Passes.td index d47e6eeb71d..1ba92461d54 100644 --- a/include/aie/Conversion/Passes.td +++ b/include/aie/Conversion/Passes.td @@ -104,4 +104,28 @@ def ConvertAIEToControlPackets : Pass<"convert-aie-to-control-packets", ]; } +//===----------------------------------------------------------------------===// +// AIEXToEmitC +//===----------------------------------------------------------------------===// + +def ConvertAIEXToEmitC : Pass<"convert-aiex-to-emitc", "mlir::ModuleOp"> { + let summary = "Convert AIEX dynamic runtime sequence ops to EmitC dialect"; + let description = [{ + This pass converts AIEX runtime sequence operations (write32, maskwrite32, + sync — including their dynamic operand forms) along with static NPU ops, + SCF control flow, and arith operations into EmitC dialect ops. The + resulting EmitC IR can be translated to C++ code via translateToCpp() + that calls functions from the standalone TxnEncoding.h library to + generate TXN binaries at runtime. 
+ }]; + let constructor = "xilinx::createConvertAIEXToEmitCPass()"; + let dependentDialects = ["mlir::emitc::EmitCDialect", + "mlir::arith::ArithDialect", + "mlir::func::FuncDialect", + "mlir::memref::MemRefDialect", + "mlir::scf::SCFDialect", + "xilinx::AIE::AIEDialect", + "xilinx::AIEX::AIEXDialect"]; +} + #endif // AIE_CONVERSION_PASSES diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td index 4ee03031f03..339bef62c83 100644 --- a/include/aie/Dialect/AIE/IR/AIEOps.td +++ b/include/aie/Dialect/AIE/IR/AIEOps.td @@ -2228,8 +2228,8 @@ def AIE_BDChainOp: AIE_Op<"bd_chain", [Symbol, SkipAccessibilityCheckTrait]> { def AIE_RuntimeSequenceOp : AIE_Op<"runtime_sequence", [ Symbol, - NoTerminator, - HasParent<"DeviceOp">, + NoTerminator, + HasParent<"DeviceOp">, ]> { let summary = "Program the configuration co-processor of the AI Engine array"; let description = [{ @@ -2238,6 +2238,11 @@ def AIE_RuntimeSequenceOp : AIE_Op<"runtime_sequence", [ Typically, these instructions include configuring the data transfers between host and AIE array on the shims. The input arguments are arguments passed in from the host at kernel invocation time. This may include buffers on the host. + + Note: This op is NOT `IsolatedFromAbove` — it can reference values defined + in the parent `DeviceOp`. However, the EmitC (C++ codegen) path only + supports capturing `arith.constant` values from the outer scope. Referencing + other external value types will produce errors during the EmitC conversion. }]; let arguments = ( ins DefaultValuedAttr:$sym_name diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index 81defea0056..a28cb850b5c 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td @@ -666,6 +666,17 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ let arguments = ( ins AnyRankedOrUnrankedMemRef:$memref, // NOTE: these are in reverse order: offset3, offset2, ... 
+ // + // TODO(future PR): tighten these to Variadic to match the NPU + // descriptor register field width. The dynamic runtime_sequence + // Python wrapper currently casts SSA i32 values up to i64 at the + // IR boundary so the static-IR corpus (which uses `arith.constant + // N : i64`) keeps parsing. AIEDmaToNpu's getAsValue already does + // width coercion in either direction, so the eventual switch to + // i32 will be a one-line change here once all hand-written MLIR + // test inputs (test/aiecc, test/npu-xrt, programming_examples) + // and any out-of-tree consumers have been migrated off i64 SSA + // constants. Variadic:$offsets, Variadic:$sizes, Variadic:$strides, @@ -762,13 +773,19 @@ def AIE_NpuWriteRTPOp: AIEX_Op<"npu.rtp_write", []> { let arguments = ( ins FlatSymbolRefAttr:$buffer, UI32Attr:$index, - I32Attr:$value + OptionalAttr:$value, + Optional:$dyn_value ); let results = (outs ); - let assemblyFormat = [{ `(` $buffer `,` $index `,` $value `)` attr-dict - }]; + let hasCustomAssemblyFormat = 1; + let hasVerifier = 1; let description = [{ - rtp write operator + rtp write operator. + When `dyn_value` is provided, it supplies the RTP value at runtime + instead of the static `value` attribute. + }]; + let extraClassDeclaration = [{ + bool hasDynamicValue() { return getDynValue() != nullptr; } }]; } @@ -795,19 +812,20 @@ def AIE_NpuPushQueueOp: AIEX_Op<"npu.push_queue", []> { } // WRITE32 -def AIE_NpuWrite32Op: AIEX_Op<"npu.write32", []> { +def AIE_NpuWrite32Op: AIEX_Op<"npu.write32", [AttrSizedOperandSegments]> { let summary = "write32 operator"; let arguments = ( ins UI32Attr:$address, UI32Attr:$value, OptionalAttr:$buffer, OptionalAttr:$column, - OptionalAttr:$row + OptionalAttr:$row, + Optional:$dyn_address, + Optional:$dyn_value ); let results = (outs ); - let assemblyFormat = [{ - attr-dict - }]; + let hasCustomAssemblyFormat = 1; + let hasVerifier = 1; let description = [{ NPU write32 operator writes a 32bit value to the AIE array. 
If 'buffer' is present then 'address' is interpreted as an offset into the @@ -816,14 +834,42 @@ def AIE_NpuWrite32Op: AIEX_Op<"npu.write32", []> { into the memory space of aie.tile(column, row). If 'buffer' is not present and 'column' and 'row' are not present then 'address' is interpreted as a full 32-bit address in the AIE array. + + Optionally, SSA values can be provided for 'dyn_address' and 'dyn_value' + to enable runtime-parameterized sequences. When dynamic operands are present, + the static attributes serve as placeholders (typically 0) and the SSA values + are used instead. + + Static syntax: `aiex.npu.write32 {address = 123 : ui32, value = 456 : ui32}` + Dynamic syntax: `aiex.npu.write32(%addr, %val) {address = 0 : ui32, value = 0 : ui32} : i32, i32` }]; let extraClassDeclaration = [{ std::optional getAbsoluteAddress(); + bool hasDynamicOperands() { return getDynAddress() != nullptr; } }]; + let builders = [ + OpBuilder<(ins "uint32_t":$address, "uint32_t":$value, + "mlir::FlatSymbolRefAttr":$buffer, + "mlir::IntegerAttr":$column, "mlir::IntegerAttr":$row), [{ + build($_builder, $_state, + $_builder.getUI32IntegerAttr(address), + $_builder.getUI32IntegerAttr(value), + buffer, column, row, + /*dyn_address=*/Value(), /*dyn_value=*/Value()); + }]>, + OpBuilder<(ins "mlir::IntegerAttr":$address, "mlir::IntegerAttr":$value, + "mlir::FlatSymbolRefAttr":$buffer, + "mlir::IntegerAttr":$column, "mlir::IntegerAttr":$row), [{ + build($_builder, $_state, + address, value, + buffer, column, row, + /*dyn_address=*/Value(), /*dyn_value=*/Value()); + }]> + ]; } // MASKWRITE -def AIE_NpuMaskWrite32Op: AIEX_Op<"npu.maskwrite32", []> { +def AIE_NpuMaskWrite32Op: AIEX_Op<"npu.maskwrite32", [AttrSizedOperandSegments]> { let summary = "Write a masked 32-bit value to the AIE array"; let arguments = ( ins UI32Attr:$address, @@ -831,12 +877,14 @@ def AIE_NpuMaskWrite32Op: AIEX_Op<"npu.maskwrite32", []> { UI32Attr:$mask, OptionalAttr:$buffer, OptionalAttr:$column, - 
OptionalAttr:$row + OptionalAttr:$row, + Optional:$dyn_address, + Optional:$dyn_value, + Optional:$dyn_mask ); let results = (outs ); - let assemblyFormat = [{ - attr-dict - }]; + let hasCustomAssemblyFormat = 1; + let hasVerifier = 1; let description = [{ NPU mask write32 operator writes a masked 32bit value to the AIE array. If 'buffer' is present then 'address' is interpreted as an offset into the @@ -845,10 +893,38 @@ def AIE_NpuMaskWrite32Op: AIEX_Op<"npu.maskwrite32", []> { into the memory space of aie.tile(column, row). If 'buffer' is not present and 'column' and 'row' are not present then 'address' is interpreted as a full 32-bit address in the AIE array. + + Optionally, SSA values can be provided for 'dyn_address', 'dyn_value', and + 'dyn_mask' to enable runtime-parameterized sequences. + + Static syntax: `aiex.npu.maskwrite32 {address = 123 : ui32, ...}` + Dynamic syntax: `aiex.npu.maskwrite32(%addr, %val, %mask) {address = 0 : ui32, ...} : i32, i32, i32` }]; let extraClassDeclaration = [{ std::optional getAbsoluteAddress(); + bool hasDynamicOperands() { return getDynAddress() != nullptr; } }]; + let builders = [ + OpBuilder<(ins "uint32_t":$address, "uint32_t":$value, "uint32_t":$mask, + "mlir::FlatSymbolRefAttr":$buffer, + "mlir::IntegerAttr":$column, "mlir::IntegerAttr":$row), [{ + build($_builder, $_state, + $_builder.getUI32IntegerAttr(address), + $_builder.getUI32IntegerAttr(value), + $_builder.getUI32IntegerAttr(mask), + buffer, column, row, + /*dyn_address=*/Value(), /*dyn_value=*/Value(), /*dyn_mask=*/Value()); + }]>, + OpBuilder<(ins "mlir::IntegerAttr":$address, "mlir::IntegerAttr":$value, + "mlir::IntegerAttr":$mask, + "mlir::FlatSymbolRefAttr":$buffer, + "mlir::IntegerAttr":$column, "mlir::IntegerAttr":$row), [{ + build($_builder, $_state, + address, value, mask, + buffer, column, row, + /*dyn_address=*/Value(), /*dyn_value=*/Value(), /*dyn_mask=*/Value()); + }]> + ]; } // BLOCKWRITE @@ -881,7 +957,7 @@ def AIE_NpuBlockWriteOp: 
AIEX_Op<"npu.blockwrite", []> { } // OP_SYNC -def AIE_NpuSyncOp: AIEX_Op<"npu.sync", []> { +def AIE_NpuSyncOp: AIEX_Op<"npu.sync", [AttrSizedOperandSegments]> { let summary = "sync operator"; let arguments = ( ins I32Attr:$column, @@ -889,12 +965,17 @@ def AIE_NpuSyncOp: AIEX_Op<"npu.sync", []> { I32Attr:$direction, I32Attr:$channel, I32Attr:$column_num, - I32Attr:$row_num + I32Attr:$row_num, + Optional:$dyn_column, + Optional:$dyn_row, + Optional:$dyn_direction, + Optional:$dyn_channel, + Optional:$dyn_column_num, + Optional:$dyn_row_num ); let results = (outs ); - let assemblyFormat = [{ - attr-dict - }]; + let hasCustomAssemblyFormat = 1; + let hasVerifier = 1; let description = [{ The sync operation blocks execution of the instruction stream until a task-complete token (TCT) is received on `column`, `row`, channel `channel`, direction `direction` (where `0` is `S2MM` and `1` is `MM2S`). @@ -902,7 +983,41 @@ def AIE_NpuSyncOp: AIEX_Op<"npu.sync", []> { If this operation appears to deadlock, ensure that at least one buffer descriptor is configured to issue a TCT on the channel you expect. By default, `dma_memcpy_nd` operations only issue tokens for `S2MM` channels, and `issue_token` must be set to `true` to issue tokens for `MM2S` channels. + + Optionally, SSA values can be provided for all parameters to enable + runtime-parameterized sequences. 
+ + Static syntax (unchanged): `aiex.npu.sync {column = 0 : i32, ...}` + Dynamic syntax: `aiex.npu.sync(%col, %row, %dir, %chan, %ncol, %nrow) {column = 0 : i32, ...} : i32, i32, i32, i32, i32, i32` }]; + let extraClassDeclaration = [{ + bool hasDynamicOperands() { return getDynColumn() != nullptr; } + }]; + let builders = [ + OpBuilder<(ins "int32_t":$column, "int32_t":$row, + "int32_t":$direction, "int32_t":$channel, + "int32_t":$column_num, "int32_t":$row_num), [{ + build($_builder, $_state, + $_builder.getI32IntegerAttr(column), + $_builder.getI32IntegerAttr(row), + $_builder.getI32IntegerAttr(direction), + $_builder.getI32IntegerAttr(channel), + $_builder.getI32IntegerAttr(column_num), + $_builder.getI32IntegerAttr(row_num), + /*dyn_column=*/Value(), /*dyn_row=*/Value(), + /*dyn_direction=*/Value(), /*dyn_channel=*/Value(), + /*dyn_column_num=*/Value(), /*dyn_row_num=*/Value()); + }]>, + OpBuilder<(ins "mlir::IntegerAttr":$column, "mlir::IntegerAttr":$row, + "mlir::IntegerAttr":$direction, "mlir::IntegerAttr":$channel, + "mlir::IntegerAttr":$column_num, "mlir::IntegerAttr":$row_num), [{ + build($_builder, $_state, + column, row, direction, channel, column_num, row_num, + /*dyn_column=*/Value(), /*dyn_row=*/Value(), + /*dyn_direction=*/Value(), /*dyn_channel=*/Value(), + /*dyn_column_num=*/Value(), /*dyn_row_num=*/Value()); + }]> + ]; } // XAIE_IO_CUSTOM_OP_BEGIN + 1 (address patch) @@ -911,14 +1026,17 @@ def AIE_NpuAddressPatchOp: AIEX_Op<"npu.address_patch", []> { let arguments = ( ins UI32Attr:$addr, I32Attr:$arg_idx, - I32Attr:$arg_plus + I32Attr:$arg_plus, + Optional:$dyn_arg_plus ); let results = (outs ); let assemblyFormat = [{ - attr-dict + (`(` $dyn_arg_plus^ `:` type($dyn_arg_plus) `)`)? attr-dict }]; let description = [{ - address patch operator + address patch operator. + When `dyn_arg_plus` is provided, it is used instead of the static + `arg_plus` attribute. This enables runtime-parameterized buffer offsets. 
}]; } @@ -1029,6 +1147,7 @@ def AIE_NpuLoadPdiOp: AIEX_Op<"npu.load_pdi", []> { let hasCanonicalizeMethod = 1; } + def AIE_DMAConfigureTaskOp : AIEX_Op<"dma_configure_task", [HasParent<"AIE::RuntimeSequenceOp">, TileElement]>, Results<(outs Index:$result)> { let summary = "Concrete Instantiation of a Buffer Descriptor Chain as a Task on a Channel and Direction on a Tile"; let description = [{ diff --git a/include/aie/Runtime/TxnEncoding.h b/include/aie/Runtime/TxnEncoding.h new file mode 100644 index 00000000000..bb4bc5d14e0 --- /dev/null +++ b/include/aie/Runtime/TxnEncoding.h @@ -0,0 +1,180 @@ +//===- TxnEncoding.h - Standalone TXN instruction encoding -------*- C++ +//-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Header-only library for encoding AI Engine TXN (transaction) instructions. +// This has ZERO dependencies on MLIR or LLVM and can be used standalone in +// host applications to generate TXN binaries at runtime. +// +// The encoding logic is extracted from AIETargetNPU.cpp and is the single +// source of truth for instruction format, used by both the compiler and +// generated host code. +// +//===----------------------------------------------------------------------===// + +#ifndef AIE_RUNTIME_TXNENCODING_H +#define AIE_RUNTIME_TXNENCODING_H + +#include <cstddef> +#include <cstdint> +#include <vector> + +namespace aie_runtime { + +// Transaction opcodes - mirroring xaie_txn.h from aie-rt. +// See aie-rt commit a6196eb, xaiengine/xaie_txn.h. 
+enum TxnOpcode : uint32_t { + TXN_OPC_WRITE = 0, + TXN_OPC_BLOCKWRITE = 1, + TXN_OPC_BLOCKSET = 2, + TXN_OPC_MASKWRITE = 3, + TXN_OPC_MASKPOLL = 4, + TXN_OPC_NOOP = 5, + TXN_OPC_PREEMPT = 6, + TXN_OPC_MASKPOLL_BUSY = 7, + TXN_OPC_LOADPDI = 8, + TXN_OPC_LOAD_PM_START = 9, + TXN_OPC_CREATE_SCRATCHPAD = 10, + TXN_OPC_UPDATE_STATE_TABLE = 11, + TXN_OPC_UPDATE_REG = 12, + TXN_OPC_UPDATE_SCRATCH = 13, + TXN_OPC_CONFIG_SHIMDMA_BD = 14, + TXN_OPC_CONFIG_SHIMDMA_DMABUF_BD = 15, + TXN_OPC_CUSTOM_OP_BEGIN = 1U << 7U, + TXN_OPC_TCT = TXN_OPC_CUSTOM_OP_BEGIN, + TXN_OPC_DDR_PATCH = TXN_OPC_CUSTOM_OP_BEGIN + 1, + TXN_OPC_READ_REGS = TXN_OPC_CUSTOM_OP_BEGIN + 2, + TXN_OPC_RECORD_TIMER = TXN_OPC_CUSTOM_OP_BEGIN + 3, + TXN_OPC_MERGE_SYNC = TXN_OPC_CUSTOM_OP_BEGIN + 4, + TXN_OPC_CUSTOM_OP_NEXT = TXN_OPC_CUSTOM_OP_BEGIN + 5, + TXN_OPC_LOAD_PM_END_INTERNAL = 200, + TXN_OPC_CUSTOM_OP_MAX = 255, +}; + +// Device information for the TXN header. +struct TxnDeviceInfo { + uint8_t major = 0; + uint8_t minor = 1; + uint8_t devGen = 3; // 3 = NPU (PHX/HWK), 4 = NPU2 (STX/KRK) + uint8_t numRows = 6; + uint8_t numCols = 5; + uint8_t numMemTileRows = 1; +}; + +// Append a 6-word write32 instruction. +inline void txn_append_write32(std::vector<uint32_t> &txn, uint32_t addr, + uint32_t val) { + size_t pos = txn.size(); + txn.resize(pos + 6, 0); + txn[pos + 0] = TXN_OPC_WRITE; + // txn[pos + 1] is reserved (0) + txn[pos + 2] = addr; + txn[pos + 3] = 0; // extra bits for reg offset + txn[pos + 4] = val; + txn[pos + 5] = 6 * sizeof(uint32_t); // operation size +} + +// Append a 7-word maskwrite32 instruction. 
+inline void txn_append_maskwrite32(std::vector<uint32_t> &txn, uint32_t addr, + uint32_t val, uint32_t mask) { + size_t pos = txn.size(); + txn.resize(pos + 7, 0); + txn[pos + 0] = TXN_OPC_MASKWRITE; + // txn[pos + 1] is reserved (0) + txn[pos + 2] = addr; + txn[pos + 3] = 0; + txn[pos + 4] = val; + txn[pos + 5] = mask; + txn[pos + 6] = 7 * sizeof(uint32_t); // operation size +} + +// Append a 4-word sync (TCT) instruction. +inline void txn_append_sync(std::vector<uint32_t> &txn, uint32_t col, + uint32_t row, uint32_t dir, uint32_t chan, + uint32_t ncol, uint32_t nrow) { + size_t pos = txn.size(); + txn.resize(pos + 4, 0); + txn[pos + 0] = TXN_OPC_TCT; + txn[pos + 1] = 4 * sizeof(uint32_t); // operation size + txn[pos + 2] = (dir & 0xff) | ((row & 0xff) << 8) | ((col & 0xff) << 16); + txn[pos + 3] = + ((nrow & 0xff) << 8) | ((ncol & 0xff) << 16) | ((chan & 0xff) << 24); +} + +// Append a variable-length blockwrite instruction. +// `data` points to `count` uint32_t words of payload. +inline void txn_append_blockwrite(std::vector<uint32_t> &txn, uint32_t addr, + const uint32_t *data, size_t count) { + const unsigned headerSize = 4; + size_t pos = txn.size(); + txn.resize(pos + headerSize + count, 0); + txn[pos + 0] = TXN_OPC_BLOCKWRITE; + // txn[pos + 1] is col/row (set to 0; caller can set if needed) + txn[pos + 2] = addr; + txn[pos + 3] = static_cast<uint32_t>((headerSize + count) * sizeof(uint32_t)); + for (size_t i = 0; i < count; ++i) + txn[pos + headerSize + i] = data[i]; +} + +// Append a 12-word address_patch (DDR_PATCH) instruction. 
+inline void txn_append_address_patch(std::vector<uint32_t> &txn, uint32_t addr, + int32_t arg_idx, uint32_t arg_plus) { + size_t pos = txn.size(); + txn.resize(pos + 12, 0); + txn[pos + 0] = TXN_OPC_DDR_PATCH; // opcode + txn[pos + 1] = 12 * sizeof(uint32_t); // payload size in bytes + // pos+2..4 are reserved (zero) + txn[pos + 5] = 0; // action (0 = patch) + txn[pos + 6] = addr; // register address to patch + // pos+7 is reserved (zero) + txn[pos + 8] = static_cast<uint32_t>(arg_idx); // buffer argument index + // pos+9 is reserved (zero) + txn[pos + 10] = arg_plus; // byte offset into buffer + // pos+11 is reserved (zero) +} + +// Append a 4-word loadpdi instruction. +inline void txn_append_loadpdi(std::vector<uint32_t> &txn, uint32_t id, + uint32_t size, uint64_t addr) { + size_t pos = txn.size(); + txn.resize(pos + 4, 0); + txn[pos + 0] = TXN_OPC_LOADPDI | (id << 16); + txn[pos + 1] = size; + txn[pos + 2] = static_cast<uint32_t>(addr); + txn[pos + 3] = static_cast<uint32_t>(addr >> 32); +} + +// Append a 1-word preempt instruction. +inline void txn_append_preempt(std::vector<uint32_t> &txn, uint32_t level) { + txn.push_back(TXN_OPC_PREEMPT | (level << 8)); +} + +// Reserve 4 placeholder words for the TXN header. Call this BEFORE appending +// any instructions, so that the header space is already allocated. +inline void txn_init(std::vector<uint32_t> &txn) { txn.resize(4, 0); } + +// Finalize the 4-word TXN header in-place. Call this AFTER all instructions +// are appended. The first 4 words must have been reserved by txn_init(). +// `op_count` is the number of operations appended. 
+inline void txn_prepend_header(std::vector<uint32_t> &txn, uint32_t op_count, + TxnDeviceInfo info = {}) { + txn[0] = (static_cast<uint32_t>(info.numRows) << 24) | + (static_cast<uint32_t>(info.devGen) << 16) | + (static_cast<uint32_t>(info.minor) << 8) | + static_cast<uint32_t>(info.major); + txn[1] = (static_cast<uint32_t>(info.numMemTileRows) << 8) | + static_cast<uint32_t>(info.numCols); + txn[2] = op_count; + txn[3] = static_cast<uint32_t>(txn.size() * sizeof(uint32_t)); +} + +} // namespace aie_runtime + +#endif // AIE_RUNTIME_TXNENCODING_H diff --git a/include/aie/Targets/AIENpuLowering.h b/include/aie/Targets/AIENpuLowering.h new file mode 100644 index 00000000000..b2a25ab53a4 --- /dev/null +++ b/include/aie/Targets/AIENpuLowering.h @@ -0,0 +1,43 @@ +//===- AIENpuLowering.h - Shared NPU lowering pipeline ----------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Shared NPU lowering pipeline used by both aiecc and aie-translate. +// This is the canonical pass sequence for lowering high-level DMA task ops +// (dma_configure_task_for, dma_start_task, dma_await_task, etc.) to flat +// npu.write32/blockwrite/sync/address_patch ops. +// +//===----------------------------------------------------------------------===// + +#ifndef AIE_TARGETS_AIENPULOWERING_H +#define AIE_TARGETS_AIENPULOWERING_H + +namespace mlir { +class PassManager; +} // namespace mlir + +namespace xilinx::AIE { + +/// Populate the pass manager with the NPU lowering pipeline. 
+/// +/// Adds the following passes: +/// - AIEMaterializeRuntimeSequences (module-level, skipped if +/// \p skipMaterialize is true) +/// - AIEMaterializeBDChains (device-level) +/// - AIESubstituteShimDMAAllocations (device-level) +/// - AIEAssignRuntimeSequenceBDIDs (device-level) +/// - AIEDMATasksToNPU (device-level) +/// - AIEDmaToNpu (device-level) +/// - AIELowerSetLock (device-level) +void populateNpuLoweringPipeline(mlir::PassManager &pm, + bool skipMaterialize = false); + +} // namespace xilinx::AIE + +#endif // AIE_TARGETS_AIENPULOWERING_H diff --git a/include/aie/Targets/AIETargets.h b/include/aie/Targets/AIETargets.h index 0fcb978d6e0..8642d8df50b 100644 --- a/include/aie/Targets/AIETargets.h +++ b/include/aie/Targets/AIETargets.h @@ -73,6 +73,9 @@ mlir::LogicalResult AIETranslateToTargetArch(mlir::ModuleOp module, llvm::raw_ostream &output, llvm::StringRef deviceName); +mlir::LogicalResult AIETranslateToCppTxn(mlir::ModuleOp module, + llvm::raw_ostream &output); + } // namespace AIE namespace aievec { diff --git a/lib/Conversion/AIEToConfiguration/AIEToConfiguration.cpp b/lib/Conversion/AIEToConfiguration/AIEToConfiguration.cpp index 4cf93cd63b2..ddcb15ce6c4 100644 --- a/lib/Conversion/AIEToConfiguration/AIEToConfiguration.cpp +++ b/lib/Conversion/AIEToConfiguration/AIEToConfiguration.cpp @@ -514,7 +514,8 @@ emitTransactionOps(OpBuilder &builder, AIEX::NpuAddressPatchOp::create(builder, loc, builder.getUI32IntegerAttr(patch.addr), builder.getI32IntegerAttr(patch.argIdx), - builder.getI32IntegerAttr(patch.argPlus)); + builder.getI32IntegerAttr(patch.argPlus), + /*dyn_arg_plus=*/Value{}); } else if (op.cmd.Opcode == 0x6 /* XAie_TxnOpcode::XAIE_IO_PREEMPT */) { auto ui8Ty = IntegerType::get(builder.getContext(), 8, IntegerType::Unsigned); diff --git a/lib/Conversion/AIEXToEmitC/AIEXToEmitC.cpp b/lib/Conversion/AIEXToEmitC/AIEXToEmitC.cpp new file mode 100644 index 00000000000..c598ee61fdd --- /dev/null +++ b/lib/Conversion/AIEXToEmitC/AIEXToEmitC.cpp @@ 
-0,0 +1,776 @@ +//===- AIEXToEmitC.cpp - AIEX to EmitC conversion ---------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// This pass converts AIEX dynamic runtime sequence operations and static NPU +// ops into EmitC dialect operations. The generated EmitC IR calls functions +// from TxnEncoding.h, and MLIR's translateToCpp() produces compilable C++ code. +// +//===----------------------------------------------------------------------===// + +#include "aie/Conversion/AIEXToEmitC/AIEXToEmitC.h" + +#include "aie/Dialect/AIE/IR/AIEDialect.h" +#include "aie/Dialect/AIEX/IR/AIEXDialect.h" + +#include "mlir/Conversion/ArithToEmitC/ArithToEmitC.h" +#include "mlir/Conversion/SCFToEmitC/SCFToEmitC.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Transforms/Passes.h" +#include "mlir/Dialect/EmitC/IR/EmitC.h" +#include "mlir/Dialect/EmitC/Transforms/TypeConversions.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Format.h" + +namespace xilinx { +#define GEN_PASS_DEF_CONVERTAIEXTOEMITC +#include "aie/Conversion/Passes.h.inc" +} // namespace xilinx + +using namespace mlir; +using namespace xilinx; + +namespace { + +// Shared helper: get the emitc opaque type for uint32_t +emitc::OpaqueType getU32Type(MLIRContext *ctx) { + return 
emitc::OpaqueType::get(ctx, "uint32_t"); +} + +// Shared helper: create a uint32_t constant +Value createU32Constant(OpBuilder &builder, Location loc, uint32_t val) { + auto u32Type = getU32Type(builder.getContext()); + return emitc::ConstantOp::create( + builder, loc, u32Type, + emitc::OpaqueAttr::get(builder.getContext(), std::to_string(val) + "u")); +} + +// Shared helper: cast an SSA value to uint32_t using static_cast +Value castToU32(OpBuilder &builder, Location loc, Value val) { + auto u32Type = getU32Type(builder.getContext()); + if (val.getType() == u32Type) + return val; + return emitc::CastOp::create(builder, loc, u32Type, val); +} + +// Emit: op_count++ +void emitIncrementOpCount(OpBuilder &builder, Location loc) { + emitc::VerbatimOp::create(builder, loc, "op_count++;"); +} + +// Emit: aie_runtime::txn_append_write32(txn, addr, val) +void emitTxnWrite32(OpBuilder &builder, Location loc, Value txnVec, Value addr, + Value val) { + auto u32 = castToU32(builder, loc, addr); + auto u32v = castToU32(builder, loc, val); + emitc::CallOpaqueOp::create(builder, loc, TypeRange{}, + "aie_runtime::txn_append_write32", + ValueRange{txnVec, u32, u32v}); + emitIncrementOpCount(builder, loc); +} + +// Emit: aie_runtime::txn_append_maskwrite32(txn, addr, val, mask) +void emitTxnMaskWrite32(OpBuilder &builder, Location loc, Value txnVec, + Value addr, Value val, Value mask) { + auto u32a = castToU32(builder, loc, addr); + auto u32v = castToU32(builder, loc, val); + auto u32m = castToU32(builder, loc, mask); + emitc::CallOpaqueOp::create(builder, loc, TypeRange{}, + "aie_runtime::txn_append_maskwrite32", + ValueRange{txnVec, u32a, u32v, u32m}); + emitIncrementOpCount(builder, loc); +} + +// Emit: aie_runtime::txn_append_sync(txn, col, row, dir, chan, ncol, nrow) +void emitTxnSync(OpBuilder &builder, Location loc, Value txnVec, Value col, + Value row, Value dir, Value chan, Value ncol, Value nrow) { + auto u32col = castToU32(builder, loc, col); + auto u32row = 
castToU32(builder, loc, row); + auto u32dir = castToU32(builder, loc, dir); + auto u32chan = castToU32(builder, loc, chan); + auto u32ncol = castToU32(builder, loc, ncol); + auto u32nrow = castToU32(builder, loc, nrow); + emitc::CallOpaqueOp::create( + builder, loc, TypeRange{}, "aie_runtime::txn_append_sync", + ValueRange{txnVec, u32col, u32row, u32dir, u32chan, u32ncol, u32nrow}); + emitIncrementOpCount(builder, loc); +} + +// Emit: aie_runtime::txn_append_address_patch(txn, addr, arg_idx, arg_plus) +void emitTxnAddressPatch(OpBuilder &builder, Location loc, Value txnVec, + uint32_t addr, int32_t argIdx, int32_t argPlus, + Value dynArgPlus) { + auto addrVal = createU32Constant(builder, loc, addr); + auto idxVal = createU32Constant(builder, loc, static_cast(argIdx)); + Value plusVal; + if (dynArgPlus) { + plusVal = dynArgPlus; + } else { + plusVal = createU32Constant(builder, loc, static_cast(argPlus)); + } + emitc::CallOpaqueOp::create(builder, loc, TypeRange{}, + "aie_runtime::txn_append_address_patch", + ValueRange{txnVec, addrVal, idxVal, plusVal}); + emitIncrementOpCount(builder, loc); +} + +// Emit: aie_runtime::txn_append_blockwrite(txn, addr, data, count) +// For blockwrite, we emit the data as an inline array literal. +void emitTxnBlockWrite(OpBuilder &builder, Location loc, Value txnVec, + uint32_t addr, DenseIntElementsAttr data) { + // Build inline array data string: "uint32_t data_N[] = {0x..., 0x..., ...};" + std::string arrayStr = "{"; + llvm::raw_string_ostream ss(arrayStr); + bool first = true; + for (auto d : data) { + if (!first) + ss << ", "; + uint32_t word = d.getZExtValue(); + ss << llvm::format("0x%08Xu", word); + first = false; + } + ss << "}"; + + // Emit blockwrite via VerbatimOp since arrays don't map cleanly to emitc. 
+ std::string stmt = "{\n static const uint32_t _bd_data[] = " + arrayStr + + ";\n aie_runtime::txn_append_blockwrite(txn, " + + std::to_string(addr) + "u, _bd_data, " + + std::to_string(data.size()) + ");\n}"; + emitc::VerbatimOp::create(builder, loc, stmt); + emitIncrementOpCount(builder, loc); +} + +// Emit: blockwrite with runtime-provided dynamic word overrides. +// Creates the static BD array, overrides specific word indices with dynamic +// SSA values, then calls txn_append_blockwrite. This generalizes the previous +// single-word override to support multiple dynamic BD words. +void emitTxnBlockWriteDynamicWords( + OpBuilder &builder, Location loc, Value txnVec, uint32_t addr, + DenseIntElementsAttr data, + ArrayRef> dynamicWords) { + auto *ctx = builder.getContext(); + auto u32Type = getU32Type(ctx); + auto arrayType = emitc::ArrayType::get( + ctx, SmallVector{static_cast(data.size())}, u32Type); + + std::string arrayInit = "{"; + llvm::raw_string_ostream ss(arrayInit); + bool first = true; + for (auto d : data) { + if (!first) + ss << ", "; + uint32_t word = d.getZExtValue(); + ss << llvm::format("0x%08Xu", word); + first = false; + } + ss << "}"; + + auto arrayVar = emitc::VariableOp::create( + builder, loc, arrayType, emitc::OpaqueAttr::get(ctx, arrayInit)); + + for (auto &[wordIdx, dynVal] : dynamicWords) { + auto indexConst = createU32Constant(builder, loc, wordIdx); + auto elem = emitc::SubscriptOp::create( + builder, loc, cast>(arrayVar.getResult()), + ValueRange{indexConst}); + emitc::AssignOp::create(builder, loc, elem.getResult(), + castToU32(builder, loc, dynVal)); + } + + auto addrVal = createU32Constant(builder, loc, addr); + auto countVal = + createU32Constant(builder, loc, static_cast(data.size())); + emitc::CallOpaqueOp::create( + builder, loc, TypeRange{}, "aie_runtime::txn_append_blockwrite", + ValueRange{txnVec, addrVal, arrayVar.getResult(), countVal}); + emitIncrementOpCount(builder, loc); +} + +/// The main pass that converts AIEX 
runtime sequence ops to C++-emittable IR. +/// AIEX TXN ops are lowered directly to EmitC calls, while the surrounding +/// arith/scf structure is cloned as regular MLIR and then lowered via upstream +/// convert-arith-to-emitc / convert-scf-to-emitc patterns. +struct ConvertAIEXToEmitCPass + : xilinx::impl::ConvertAIEXToEmitCBase { + void runOnOperation() override { + auto moduleOp = getOperation(); + auto *ctx = &getContext(); + + // Collect all RuntimeSequenceOps and their parent DeviceOps. + struct SeqInfo { + AIE::RuntimeSequenceOp seq; + AIE::DeviceOp device; + }; + SmallVector sequences; + moduleOp.walk([&](AIE::RuntimeSequenceOp seq) { + auto device = seq->getParentOfType(); + sequences.push_back({seq, device}); + }); + + if (sequences.empty()) + return; + + // We'll build a new module body with emitc ops. + OpBuilder builder(ctx); + + for (auto &[seqOp, deviceOp] : sequences) { + if (!deviceOp) { + seqOp.emitOpError("must be nested inside an aie.device"); + signalPassFailure(); + return; + } + + std::string seqName = seqOp.getSymName().str(); + + if (failed(createGeneratedFunction(builder, moduleOp, seqOp, deviceOp, + seqName))) { + signalPassFailure(); + return; + } + } + + SmallVector toErase; + for (auto &op : moduleOp.getBody()->getOperations()) { + if (!isa(op) && !isa(op)) + toErase.push_back(&op); + } + for (auto *op : llvm::reverse(toErase)) + op->erase(); + + // Add the #include at the top of the module. + builder.setInsertionPointToStart(moduleOp.getBody()); + auto includeOp = emitc::IncludeOp::create(builder, moduleOp.getLoc(), + "aie/Runtime/TxnEncoding.h", + /*is_standard=*/false); + // Also include standard headers. 
+ builder.setInsertionPointAfter(includeOp); + emitc::IncludeOp::create(builder, moduleOp.getLoc(), "cstdint", + /*is_standard=*/true); + emitc::IncludeOp::create(builder, moduleOp.getLoc(), "cstddef", + /*is_standard=*/true); + emitc::IncludeOp::create(builder, moduleOp.getLoc(), "vector", + /*is_standard=*/true); + + TypeConverter typeConverter; + typeConverter.addConversion([](Type type) { return type; }); + populateEmitCSizeTTypeConversions(typeConverter); + + RewritePatternSet patterns(ctx); + // Expand arith.ceildivsi / arith.floordivsi into basic arith ops + // (cmpi/select/divsi) that the upstream ArithToEmitC patterns know how + // to lower. The Python front-end naturally produces these ops from + // expressions like `M // m` on SSA i32 runtime-sequence arguments. + arith::populateCeilFloorDivExpandOpsPatterns(patterns); + populateArithToEmitCPatterns(typeConverter, patterns); + populateSCFToEmitCConversionPatterns(patterns, typeConverter); + + ConversionTarget target(*ctx); + target.addLegalDialect(); + target.addLegalOp(); + target.addIllegalDialect(); + + if (failed(applyPartialConversion(moduleOp, target, std::move(patterns)))) { + signalPassFailure(); + return; + } + + if (failed(lowerRemainingUnrealizedCasts(moduleOp))) { + signalPassFailure(); + return; + } + } + +private: + LogicalResult createGeneratedFunction(OpBuilder &builder, ModuleOp moduleOp, + AIE::RuntimeSequenceOp seqOp, + AIE::DeviceOp deviceOp, + StringRef seqName) { + Location loc = seqOp.getLoc(); + Block &entryBlock = seqOp.getBody().front(); + + SmallVector paramTypes; + for (auto arg : entryBlock.getArguments()) { + if (!isa(arg.getType())) + paramTypes.push_back(arg.getType()); + } + + auto txnVecType = + emitc::OpaqueType::get(moduleOp.getContext(), "std::vector"); + auto funcType = FunctionType::get(moduleOp.getContext(), paramTypes, + TypeRange{txnVecType}); + + builder.setInsertionPointToEnd(moduleOp.getBody()); + std::string funcName = "generate_txn_" + seqName.str(); + auto 
funcOp = emitc::FuncOp::create(builder, loc, funcName, funcType); + funcOp.setSpecifiersAttr(builder.getStrArrayAttr({"inline"})); + Block *funcBlock = funcOp.addEntryBlock(); + OpBuilder funcBuilder = OpBuilder::atBlockBegin(funcBlock); + + emitc::VerbatimOp::create(funcBuilder, loc, "std::vector txn;"); + emitc::VerbatimOp::create(funcBuilder, loc, "aie_runtime::txn_init(txn);"); + emitc::VerbatimOp::create(funcBuilder, loc, "uint32_t op_count = 0;"); + Value txnVec = + emitc::LiteralOp::create(funcBuilder, loc, txnVecType, "txn"); + + IRMapping mapping; + unsigned paramIdx = 0; + for (auto arg : entryBlock.getArguments()) { + if (isa(arg.getType())) + continue; + mapping.map(arg, funcBlock->getArgument(paramIdx++)); + } + + if (failed(cloneExternalConstants(seqOp, funcBuilder, mapping))) + return failure(); + + if (failed(cloneBlock(entryBlock, funcBuilder, mapping, txnVec))) + return failure(); + + const auto &tm = deviceOp.getTargetModel(); + uint8_t devGen = llvm::isa(tm) ? 4 : 3; + emitc::VerbatimOp::create( + funcBuilder, loc, + "aie_runtime::txn_prepend_header(txn, op_count, {0, 1, " + + std::to_string(devGen) + ", " + std::to_string(tm.rows()) + ", " + + std::to_string(tm.columns()) + ", " + + std::to_string(tm.getNumMemTileRows()) + "});"); + emitc::ReturnOp::create(funcBuilder, loc, txnVec); + return success(); + } + + LogicalResult cloneExternalConstants(AIE::RuntimeSequenceOp seqOp, + OpBuilder &builder, IRMapping &mapping) { + llvm::DenseSet clonedConstants; + WalkResult prescan = seqOp.walk([&](Operation *innerOp) -> WalkResult { + for (Value operand : innerOp->getOperands()) { + if (mapping.contains(operand)) + continue; + Operation *defOp = operand.getDefiningOp(); + if (!defOp || seqOp->isAncestor(defOp)) + continue; + auto constOp = dyn_cast(defOp); + if (!constOp) { + innerOp->emitOpError( + "uses an external value that is not an arith.constant"); + return WalkResult::interrupt(); + } + if (clonedConstants.insert(defOp).second) { + Operation 
*newConst = builder.clone(*defOp, mapping); + mapping.map(constOp.getResult(), newConst->getResult(0)); + } + } + return WalkResult::advance(); + }); + return prescan.wasInterrupted() ? failure() : success(); + } + + LogicalResult cloneBlock(Block &block, OpBuilder &builder, IRMapping &mapping, + Value txnVec) { + for (auto it = block.begin(), e = block.end(); it != e; ++it) { + Operation *op = &*it; + + auto blockWrite = dyn_cast(op); + if (!blockWrite) { + if (failed(cloneOp(builder, op, mapping, txnVec))) + return failure(); + continue; + } + + uint32_t blockAddr = blockWrite.getAddress(); + if (auto absAddr = blockWrite.getAbsoluteAddress()) + blockAddr = *absAddr; + + // Scan forward for dynamic NpuWrite32Ops whose constant addresses + // target BD words within this blockwrite's range (blockAddr to + // blockAddr + 28), terminated by an NpuAddressPatchOp at + // blockAddr + 4. + // + // Pure (side-effect-free) ops are allowed in between, since the + // dma-to-npu lowering interleaves arith.constant / andi / shli / + // ori ops between the override write32s to compute their dynamic + // values. Any non-pure op or any other AIEX TXN op aborts the + // fusion attempt and we fall back to per-op emission. + auto scanIt = std::next(it); + SmallVector> dynWrite32s; + AIEX::NpuAddressPatchOp matchedPatch = nullptr; + + while (scanIt != e) { + Operation *cur = &*scanIt; + + if (auto patch = dyn_cast(cur)) { + if (patch.getAddr() == blockAddr + 4) + matchedPatch = patch; + break; + } + + if (auto w32 = dyn_cast(cur)) { + if (!w32.hasDynamicOperands()) + break; + auto addrConst = + w32.getDynAddress().getDefiningOp(); + auto addrAttr = + addrConst ? 
dyn_cast(addrConst.getValue()) : nullptr; + if (!addrAttr) + break; + uint64_t w32Addr = addrAttr.getValue().getZExtValue(); + if (w32Addr < blockAddr || w32Addr > blockAddr + 28 || + (w32Addr - blockAddr) % 4 != 0) + break; + uint32_t wordIdx = static_cast((w32Addr - blockAddr) / 4); + dynWrite32s.push_back({wordIdx, w32}); + ++scanIt; + continue; + } + + // Any other AIEX TXN op between blockwrite and its address_patch + // means this blockwrite is not part of a BD-with-overrides pattern. + if (cur->getDialect() && cur->getDialect()->getNamespace() == "aiex") + break; + + // Allow only pure helper ops to be skipped; bail on anything else + // to preserve side effects. + if (!isPure(cur)) + break; + + ++scanIt; + } + + if (!matchedPatch || dynWrite32s.empty()) { + if (failed(cloneOp(builder, op, mapping, txnVec))) + return failure(); + continue; + } + + auto data = blockWrite.getDataWords(); + if (!data) + return failure(); + + // Pre-clone all source ops between the source blockwrite (exclusive) + // and the matched address_patch (exclusive), except for the consumed + // override NpuWrite32Ops themselves. This ensures each override's + // dyn_value SSA ref is materialized in the new IR before we emit + // the consolidated `txn_append_blockwrite` call (which assigns those + // values into _bd_data[wordIdx]). 
+ llvm::SmallPtrSet consumed; + for (auto &[idx, w32] : dynWrite32s) + consumed.insert(w32.getOperation()); + + for (auto innerIt = std::next(it); innerIt != scanIt; ++innerIt) { + Operation *innerOp = &*innerIt; + if (consumed.contains(innerOp)) + continue; + if (failed(cloneOp(builder, innerOp, mapping, txnVec))) + return failure(); + } + + SmallVector> dynamicWords; + for (auto &[wordIdx, w32] : dynWrite32s) { + dynamicWords.push_back( + {wordIdx, mapping.lookupOrDefault(w32.getDynValue())}); + } + + emitTxnBlockWriteDynamicWords(builder, blockWrite.getLoc(), txnVec, + blockAddr, data, dynamicWords); + Value dynPlus = matchedPatch.getDynArgPlus(); + if (dynPlus) + dynPlus = mapping.lookupOrDefault(dynPlus); + emitTxnAddressPatch(builder, matchedPatch.getLoc(), txnVec, + matchedPatch.getAddr(), matchedPatch.getArgIdx(), + matchedPatch.getArgPlus(), dynPlus); + + // All overrides + intervening arith ops + addrPatch are consumed. + // Set it = scanIt so the loop's ++it advances past the addrPatch. 
+ it = scanIt; + } + return success(); + } + + LogicalResult cloneScfFor(OpBuilder &builder, scf::ForOp forOp, + IRMapping &mapping, Value txnVec) { + Location loc = forOp.getLoc(); + SmallVector initArgs; + for (Value initArg : forOp.getInitArgs()) + initArgs.push_back(mapping.lookupOrDefault(initArg)); + + auto newFor = scf::ForOp::create( + builder, loc, mapping.lookupOrDefault(forOp.getLowerBound()), + mapping.lookupOrDefault(forOp.getUpperBound()), + mapping.lookupOrDefault(forOp.getStep()), initArgs); + + for (auto [oldResult, newResult] : + llvm::zip(forOp.getResults(), newFor.getResults())) + mapping.map(oldResult, newResult); + + Block &newBody = newFor.getRegion().front(); + newBody.getOperations().clear(); + IRMapping nestedMapping = mapping; + nestedMapping.map(forOp.getInductionVar(), newFor.getInductionVar()); + for (auto [oldArg, newArg] : + llvm::zip(forOp.getRegionIterArgs(), newFor.getRegionIterArgs())) + nestedMapping.map(oldArg, newArg); + + OpBuilder bodyBuilder = OpBuilder::atBlockBegin(&newBody); + return cloneBlock(forOp.getRegion().front(), bodyBuilder, nestedMapping, + txnVec); + } + + LogicalResult cloneScfIf(OpBuilder &builder, scf::IfOp ifOp, + IRMapping &mapping, Value txnVec) { + Location loc = ifOp.getLoc(); + auto newIf = scf::IfOp::create(builder, loc, ifOp.getResultTypes(), + mapping.lookupOrDefault(ifOp.getCondition()), + !ifOp.getElseRegion().empty()); + + for (auto [oldResult, newResult] : + llvm::zip(ifOp.getResults(), newIf.getResults())) + mapping.map(oldResult, newResult); + + Block &thenBlock = newIf.getThenRegion().front(); + thenBlock.getOperations().clear(); + OpBuilder thenBuilder = OpBuilder::atBlockBegin(&thenBlock); + if (failed(cloneBlock(ifOp.getThenRegion().front(), thenBuilder, mapping, + txnVec))) + return failure(); + + if (!ifOp.getElseRegion().empty()) { + Block &elseBlock = newIf.getElseRegion().front(); + elseBlock.getOperations().clear(); + OpBuilder elseBuilder = OpBuilder::atBlockBegin(&elseBlock); + if 
(failed(cloneBlock(ifOp.getElseRegion().front(), elseBuilder, mapping, + txnVec))) + return failure(); + } + + return success(); + } + + /// Clone a source operation into the generated function, converting only the + /// AIEX TXN ops directly to EmitC. + LogicalResult cloneOp(OpBuilder &builder, Operation *op, IRMapping &mapping, + Value txnVec) { + Location opLoc = op->getLoc(); + + // AIEX write32 - handles both static and dynamic forms. + if (auto write32 = dyn_cast(op)) { + Value addrVal, valVal; + if (write32.hasDynamicOperands()) { + addrVal = mapping.lookupOrDefault(write32.getDynAddress()); + valVal = mapping.lookupOrDefault(write32.getDynValue()); + } else { + uint32_t addr = write32.getAddress(); + if (auto absAddr = write32.getAbsoluteAddress()) + addr = *absAddr; + addrVal = createU32Constant(builder, opLoc, addr); + valVal = createU32Constant(builder, opLoc, write32.getValue()); + } + emitTxnWrite32(builder, opLoc, txnVec, addrVal, valVal); + return success(); + } + + // AIEX maskwrite32 - handles both static and dynamic forms. + if (auto maskWrite = dyn_cast(op)) { + Value addrVal, valVal, maskVal; + if (maskWrite.hasDynamicOperands()) { + addrVal = mapping.lookupOrDefault(maskWrite.getDynAddress()); + valVal = mapping.lookupOrDefault(maskWrite.getDynValue()); + maskVal = mapping.lookupOrDefault(maskWrite.getDynMask()); + } else { + uint32_t addr = maskWrite.getAddress(); + if (auto absAddr = maskWrite.getAbsoluteAddress()) + addr = *absAddr; + addrVal = createU32Constant(builder, opLoc, addr); + valVal = createU32Constant(builder, opLoc, maskWrite.getValue()); + maskVal = createU32Constant(builder, opLoc, maskWrite.getMask()); + } + emitTxnMaskWrite32(builder, opLoc, txnVec, addrVal, valVal, maskVal); + return success(); + } + + // AIEX sync - handles both static and dynamic forms. 
+ if (auto syncOp = dyn_cast(op)) { + Value col, row, dir, chan, ncol, nrow; + if (syncOp.hasDynamicOperands()) { + col = mapping.lookupOrDefault(syncOp.getDynColumn()); + row = mapping.lookupOrDefault(syncOp.getDynRow()); + dir = mapping.lookupOrDefault(syncOp.getDynDirection()); + chan = mapping.lookupOrDefault(syncOp.getDynChannel()); + ncol = mapping.lookupOrDefault(syncOp.getDynColumnNum()); + nrow = mapping.lookupOrDefault(syncOp.getDynRowNum()); + } else { + col = createU32Constant(builder, opLoc, syncOp.getColumn()); + row = createU32Constant(builder, opLoc, syncOp.getRow()); + dir = createU32Constant(builder, opLoc, + static_cast(syncOp.getDirection())); + chan = createU32Constant(builder, opLoc, syncOp.getChannel()); + ncol = createU32Constant(builder, opLoc, syncOp.getColumnNum()); + nrow = createU32Constant(builder, opLoc, syncOp.getRowNum()); + } + emitTxnSync(builder, opLoc, txnVec, col, row, dir, chan, ncol, nrow); + return success(); + } + + if (auto addrPatch = dyn_cast(op)) { + Value dynPlus = addrPatch.getDynArgPlus(); + if (dynPlus) + dynPlus = mapping.lookupOrDefault(dynPlus); + emitTxnAddressPatch(builder, opLoc, txnVec, addrPatch.getAddr(), + addrPatch.getArgIdx(), addrPatch.getArgPlus(), + dynPlus); + return success(); + } + + if (auto blockWrite = dyn_cast(op)) { + uint32_t addr = blockWrite.getAddress(); + if (auto absAddr = blockWrite.getAbsoluteAddress()) + addr = *absAddr; + auto data = blockWrite.getDataWords(); + if (!data) + return failure(); + emitTxnBlockWrite(builder, opLoc, txnVec, addr, data); + return success(); + } + + if (auto forOp = dyn_cast(op)) { + return cloneScfFor(builder, forOp, mapping, txnVec); + } + + if (auto ifOp = dyn_cast(op)) { + return cloneScfIf(builder, ifOp, mapping, txnVec); + } + + if (auto getGlobal = dyn_cast(op)) { + for (Operation *user : getGlobal->getUsers()) { + if (!isa(user)) { + return op->emitOpError( + "unsupported memref.get_global use in TXN EmitC conversion"); + } + } + // Blockwrite 
lowering consumes the referenced data directly; cloning the + // memref.get_global would leave a dangling reference once module-level + // memref.global ops are erased. + return success(); + } + + if (auto minOp = dyn_cast(op)) { + Value lhs = mapping.lookupOrDefault(minOp.getLhs()); + Value rhs = mapping.lookupOrDefault(minOp.getRhs()); + auto cmp = builder.create(opLoc, arith::CmpIPredicate::slt, + lhs, rhs); + auto select = + builder.create(opLoc, cmp.getResult(), lhs, rhs); + mapping.map(minOp.getResult(), select.getResult()); + return success(); + } + + if (auto maxOp = dyn_cast(op)) { + Value lhs = mapping.lookupOrDefault(maxOp.getLhs()); + Value rhs = mapping.lookupOrDefault(maxOp.getRhs()); + auto cmp = builder.create(opLoc, arith::CmpIPredicate::sgt, + lhs, rhs); + auto select = + builder.create(opLoc, cmp.getResult(), lhs, rhs); + mapping.map(maxOp.getResult(), select.getResult()); + return success(); + } + + if (isa(op)) + return success(); + + if (isa( + op)) + return op->emitOpError("not supported in dynamic TXN C++ generation"); + + if (op->getNumRegions() != 0) + return op->emitOpError( + "unsupported region operation in TXN EmitC conversion"); + + Operation *cloned = builder.clone(*op, mapping); + for (auto [oldResult, newResult] : + llvm::zip(op->getResults(), cloned->getResults())) + mapping.map(oldResult, newResult); + return success(); + } + + LogicalResult lowerRemainingUnrealizedCasts(ModuleOp moduleOp) { + SmallVector initialCasts; + moduleOp.walk( + [&](UnrealizedConversionCastOp cast) { initialCasts.push_back(cast); }); + reconcileUnrealizedCasts(initialCasts); + + SmallVector remainingCasts; + moduleOp.walk([&](UnrealizedConversionCastOp cast) { + remainingCasts.push_back(cast); + }); + + for (auto cast : remainingCasts) { + if (cast->getNumOperands() != 1 || cast->getNumResults() != 1) + return cast->emitOpError("unsupported unrealized conversion cast arity " + "after EmitC conversion"); + + Value input = cast.getOperands().front(); + 
Type srcType = input.getType(); + Type dstType = cast.getResult(0).getType(); + if (srcType == dstType) { + cast.getResult(0).replaceAllUsesWith(input); + cast.erase(); + continue; + } + + OpBuilder builder(cast); + Value replacement; + auto ptrDiffTy = emitc::PtrDiffTType::get(moduleOp.getContext()); + + if (srcType.isIndex() && isa(dstType)) { + Value signedSize = + emitc::CastOp::create(builder, cast.getLoc(), ptrDiffTy, input); + replacement = + emitc::CastOp::create(builder, cast.getLoc(), dstType, signedSize); + } else if (isa(srcType) && dstType.isIndex()) { + Value signedSize = + emitc::CastOp::create(builder, cast.getLoc(), ptrDiffTy, input); + replacement = + emitc::CastOp::create(builder, cast.getLoc(), dstType, signedSize); + } else { + return cast->emitOpError( + "unsupported unrealized conversion cast after EmitC conversion"); + } + + cast.getResult(0).replaceAllUsesWith(replacement); + cast.erase(); + } + + SmallVector finalCasts; + moduleOp.walk( + [&](UnrealizedConversionCastOp cast) { finalCasts.push_back(cast); }); + if (!finalCasts.empty()) { + finalCasts.front()->emitOpError("unresolved unrealized conversion casts " + "remain after EmitC conversion"); + return failure(); + } + return success(); + } +}; + +} // namespace + +std::unique_ptr> +xilinx::createConvertAIEXToEmitCPass() { + return std::make_unique(); +} diff --git a/lib/Conversion/AIEXToEmitC/CMakeLists.txt b/lib/Conversion/AIEXToEmitC/CMakeLists.txt new file mode 100644 index 00000000000..fb1df45d982 --- /dev/null +++ b/lib/Conversion/AIEXToEmitC/CMakeLists.txt @@ -0,0 +1,28 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. 
+add_mlir_conversion_library(MLIRAIEXToEmitC + AIEXToEmitC.cpp + + ADDITIONAL_HEADER_DIRS + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/aie/Conversion/AIEXToEmitC + + DEPENDS + MLIRAIEConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + AIE + AIEX + MLIRArithToEmitC + MLIREmitCDialect + MLIRArithDialect + MLIRArithTransforms + MLIRSCFToEmitC + MLIRSCFDialect + MLIRTransforms +) diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index 660e9c66694..6e851e33e45 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -4,4 +4,5 @@ # # (c) Copyright 2024 Advanced Micro Devices, Inc. add_subdirectory(AIEToConfiguration) -add_subdirectory(AIEVecToLLVM) \ No newline at end of file +add_subdirectory(AIEVecToLLVM) +add_subdirectory(AIEXToEmitC) \ No newline at end of file diff --git a/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp index a0f4d712b6a..983cd174b61 100644 --- a/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp +++ b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp @@ -52,7 +52,8 @@ struct AIEAssignCoreLinkFilesPass DenseMap> funcToObjs; for (auto funcOp : device.getOps()) { if (auto attr = funcOp->getAttrOfType("link_with")) { - funcToObjs[funcOp.getName()].push_back(attr.getValue()); + if (!attr.getValue().empty()) + funcToObjs[funcOp.getName()].push_back(attr.getValue()); } } @@ -72,10 +73,12 @@ struct AIEAssignCoreLinkFilesPass // Migrate deprecated core-level attr: warn, consume it, and add to set. 
if (auto lw = core.getLinkWith()) { - core.emitWarning( - "link_with on aie.core is deprecated; attach link_with to " - "the func.func declaration instead"); - needed.insert(lw.value()); + if (!lw.value().empty()) { + core.emitWarning( + "link_with on aie.core is deprecated; attach link_with to " + "the func.func declaration instead"); + needed.insert(lw.value()); + } core->removeAttr("link_with"); } diff --git a/lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp b/lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp index feb06366981..a7b6a3f4447 100644 --- a/lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp +++ b/lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp @@ -618,7 +618,8 @@ struct AIEInsertTraceFlowsPass shimInfo.shimTile, targetModel); xilinx::AIEX::NpuAddressPatchOp::create(builder, runtimeSeq.getLoc(), bdAddress, chanDesc.argIdx, - chanDesc.bufferOffset); + chanDesc.bufferOffset, + /*dyn_arg_plus=*/Value{}); // 4e. DMA channel configuration — set Controller_ID from tile attribute uint32_t ctrlAddr = diff --git a/lib/Dialect/AIE/Transforms/AIEVectorTransferLowering.cpp b/lib/Dialect/AIE/Transforms/AIEVectorTransferLowering.cpp index d2f29d98f94..0a8506c5766 100644 --- a/lib/Dialect/AIE/Transforms/AIEVectorTransferLowering.cpp +++ b/lib/Dialect/AIE/Transforms/AIEVectorTransferLowering.cpp @@ -63,7 +63,15 @@ struct AIEVectorTransferLoweringPass RewritePatternSet patterns(context); vector::populateVectorTransferLoweringPatterns(patterns, maxRank); - if (failed(applyPatternsGreedily(deviceOp, std::move(patterns)))) + // Disable cross-region constant CSE to prevent the greedy rewriter from + // hoisting arith.constant ops from inside aie.runtime_sequence up to the + // enclosing aie.device scope. Without this, the default cseConstants=true + // can make aie.core bodies reference device-scope values, which breaks + // AIECoreToStandardPass when it clones the core body into a func.func. 
+ GreedyRewriteConfig config; + config.enableConstantCSE(false); + + if (failed(applyPatternsGreedily(deviceOp, std::move(patterns), config))) signalPassFailure(); } }; diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 103c08b1b85..f7fb4f19ae1 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -537,18 +537,22 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() { return emitOpError("Minimum data transfer size required is ") << addressGranularity << "bits. "; } - if (!llvm::all_of(getMixedStrides(), [](OpFoldResult s) { - return getConstantIntValue(s).has_value(); - })) - return emitOpError("Only constant strides currently supported."); - if (!llvm::all_of(getMixedSizes(), [](OpFoldResult s) { - return getConstantIntValue(s).has_value(); - })) - return emitOpError("Only constant sizes currently supported."); - if (!llvm::all_of(getMixedOffsets(), [](OpFoldResult s) { - return getConstantIntValue(s).has_value(); - })) - return emitOpError("Only constant offsets currently supported."); + // Check if all sizes, strides, and offsets are compile-time constants. + // When any are SSA values (dynamic), skip the constant-only verification + // since those checks cannot be performed at compile time. + bool allSizesConstant = llvm::all_of(getMixedSizes(), [](OpFoldResult s) { + return getConstantIntValue(s).has_value(); + }); + bool allStridesConstant = llvm::all_of(getMixedStrides(), [](OpFoldResult s) { + return getConstantIntValue(s).has_value(); + }); + bool allOffsetsConstant = llvm::all_of(getMixedOffsets(), [](OpFoldResult s) { + return getConstantIntValue(s).has_value(); + }); + + // Skip detailed stride/size/offset verification when values are dynamic. 
+ if (!allSizesConstant || !allStridesConstant || !allOffsetsConstant) + return success(); llvm::SmallVector inputSizes = llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) { @@ -747,17 +751,368 @@ static std::optional getAbsoluteAddress(T *op) { } std::optional AIEX::NpuWrite32Op::getAbsoluteAddress() { + if (hasDynamicOperands()) + return std::nullopt; return ::getAbsoluteAddress(this); } +//===----------------------------------------------------------------------===// +// NpuWrite32Op parse/print/verify +//===----------------------------------------------------------------------===// + +/// Parse: `aiex.npu.write32(%addr, %val) {attrs} : type, type` +/// or: `aiex.npu.write32 {attrs}` +ParseResult AIEX::NpuWrite32Op::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector dynOperands; + SmallVector dynTypes; + + // Try to parse optional `(operands)` for dynamic form + bool hasDynamic = false; + if (succeeded(parser.parseOptionalLParen())) { + hasDynamic = true; + OpAsmParser::UnresolvedOperand addr, val; + if (parser.parseOperand(addr) || parser.parseComma() || + parser.parseOperand(val) || parser.parseRParen()) + return failure(); + dynOperands.push_back(addr); + dynOperands.push_back(val); + } + + if (parser.parseOptionalAttrDict(result.attributes)) + return failure(); + + if (hasDynamic) { + if (parser.parseColon() || parser.parseType(dynTypes.emplace_back()) || + parser.parseComma() || parser.parseType(dynTypes.emplace_back())) + return failure(); + if (parser.resolveOperand(dynOperands[0], dynTypes[0], result.operands) || + parser.resolveOperand(dynOperands[1], dynTypes[1], result.operands)) + return failure(); + } + + // Set operand segment sizes: [dyn_address, dyn_value] + result.addAttribute("operandSegmentSizes", + parser.getBuilder().getDenseI32ArrayAttr( + {hasDynamic ? 1 : 0, hasDynamic ? 
1 : 0})); + + return success(); +} + +void AIEX::NpuWrite32Op::print(OpAsmPrinter &p) { + if (hasDynamicOperands()) { + p << '(' << getDynAddress() << ", " << getDynValue() << ')'; + } + p.printOptionalAttrDict((*this)->getAttrs(), + /*elidedAttrs=*/{"operandSegmentSizes"}); + if (hasDynamicOperands()) { + p << " : " << getDynAddress().getType() << ", " << getDynValue().getType(); + } +} + +LogicalResult AIEX::NpuWrite32Op::verify() { + bool hasAddr = getDynAddress() != nullptr; + bool hasVal = getDynValue() != nullptr; + if (hasAddr != hasVal) + return emitOpError( + "dynamic operands must be provided together (both or neither)"); + + if (hasAddr) { + auto addrType = getDynAddress().getType(); + auto valType = getDynValue().getType(); + if (auto intType = dyn_cast(addrType)) { + if (intType.getWidth() != 32) + return emitOpError("dynamic address must be a 32-bit integer"); + } + if (auto intType = dyn_cast(valType)) { + if (intType.getWidth() != 32) + return emitOpError("dynamic value must be a 32-bit integer"); + } + } + return success(); +} + +//===----------------------------------------------------------------------===// +// NpuWriteRTPOp parse/print/verify +//===----------------------------------------------------------------------===// + +/// Parse: `aiex.npu.rtp_write(@buffer, 0 : ui32, 42 : i32)` (static) +/// or: `aiex.npu.rtp_write(@buffer, 0 : ui32, %val) : i32` (dynamic) +ParseResult AIEX::NpuWriteRTPOp::parse(OpAsmParser &parser, + OperationState &result) { + if (parser.parseLParen()) + return failure(); + + // Parse buffer symbol ref + FlatSymbolRefAttr bufferAttr; + if (parser.parseAttribute(bufferAttr)) + return failure(); + result.addAttribute("buffer", bufferAttr); + + if (parser.parseComma()) + return failure(); + + // Parse index attribute (ui32). Accepts both bare (0) and typed (0 : ui32). 
+ IntegerAttr indexAttr; + if (parser.parseAttribute(indexAttr)) + return failure(); + auto ui32Type = parser.getBuilder().getIntegerType(32, /*isSigned=*/false); + if (indexAttr.getType() != ui32Type) + indexAttr = IntegerAttr::get(ui32Type, indexAttr.getInt()); + result.addAttribute("index", indexAttr); + + if (parser.parseComma()) + return failure(); + + // Try to parse dynamic value (SSA operand %val), else parse static attr. + OpAsmParser::UnresolvedOperand dynVal; + bool hasDynamic = false; + + auto optResult = parser.parseOptionalOperand(dynVal); + if (optResult.has_value()) { + if (failed(*optResult)) + return failure(); + hasDynamic = true; + } else { + // Static value. Accepts both bare (42) and typed (42 : i32). + IntegerAttr valueAttr; + if (parser.parseAttribute(valueAttr)) + return failure(); + auto i32Type = parser.getBuilder().getIntegerType(32); + if (valueAttr.getType() != i32Type) + valueAttr = IntegerAttr::get(i32Type, valueAttr.getInt()); + result.addAttribute("value", valueAttr); + } + + if (parser.parseRParen()) + return failure(); + + if (hasDynamic) { + Type dynType; + if (parser.parseColon() || parser.parseType(dynType)) + return failure(); + if (parser.resolveOperand(dynVal, dynType, result.operands)) + return failure(); + } + + if (parser.parseOptionalAttrDict(result.attributes)) + return failure(); + + return success(); +} + +void AIEX::NpuWriteRTPOp::print(OpAsmPrinter &p) { + p << "("; + p.printAttribute(getBufferAttr()); + p << ", "; + p.printAttribute(getIndexAttr()); + p << ", "; + if (hasDynamicValue()) { + p.printOperand(getDynValue()); + p << ")"; + } else { + p.printAttribute(getValueAttr()); + p << ")"; + } + + // Print type annotation for dynamic case (before attr-dict, matching parser) + if (hasDynamicValue()) { + p << " : " << getDynValue().getType(); + } + + // Elide attributes that are printed inline or handled by traits + SmallVector elidedAttrs = {"buffer", "index", + "operandSegmentSizes"}; + if (!hasDynamicValue()) + 
elidedAttrs.push_back("value"); + p.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); +} + +LogicalResult AIEX::NpuWriteRTPOp::verify() { + bool hasStaticVal = getValueAttr() != nullptr; + bool hasDynVal = getDynValue() != nullptr; + + if (hasStaticVal == hasDynVal) + return emitOpError( + "exactly one of 'value' (static) or 'dyn_value' (dynamic) " + "must be provided"); + + return success(); +} + //===----------------------------------------------------------------------===// // NpuMaskWrite32Op //===----------------------------------------------------------------------===// std::optional AIEX::NpuMaskWrite32Op::getAbsoluteAddress() { + if (hasDynamicOperands()) + return std::nullopt; return ::getAbsoluteAddress(this); } +/// Parse: `aiex.npu.maskwrite32(%addr, %val, %mask) {attrs} : type, type, type` +/// or: `aiex.npu.maskwrite32 {attrs}` +ParseResult AIEX::NpuMaskWrite32Op::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector dynOperands; + SmallVector dynTypes; + + bool hasDynamic = false; + if (succeeded(parser.parseOptionalLParen())) { + hasDynamic = true; + OpAsmParser::UnresolvedOperand addr, val, mask; + if (parser.parseOperand(addr) || parser.parseComma() || + parser.parseOperand(val) || parser.parseComma() || + parser.parseOperand(mask) || parser.parseRParen()) + return failure(); + dynOperands.push_back(addr); + dynOperands.push_back(val); + dynOperands.push_back(mask); + } + + if (parser.parseOptionalAttrDict(result.attributes)) + return failure(); + + if (hasDynamic) { + if (parser.parseColon() || parser.parseType(dynTypes.emplace_back()) || + parser.parseComma() || parser.parseType(dynTypes.emplace_back()) || + parser.parseComma() || parser.parseType(dynTypes.emplace_back())) + return failure(); + for (unsigned i = 0; i < 3; ++i) + if (parser.resolveOperand(dynOperands[i], dynTypes[i], result.operands)) + return failure(); + } + + // Set operand segment sizes: [dyn_address, dyn_value, dyn_mask] + int seg = hasDynamic ? 
1 : 0; + result.addAttribute( + "operandSegmentSizes", + parser.getBuilder().getDenseI32ArrayAttr({seg, seg, seg})); + + return success(); +} + +void AIEX::NpuMaskWrite32Op::print(OpAsmPrinter &p) { + if (hasDynamicOperands()) { + p << '(' << getDynAddress() << ", " << getDynValue() << ", " << getDynMask() + << ')'; + } + p.printOptionalAttrDict((*this)->getAttrs(), + /*elidedAttrs=*/{"operandSegmentSizes"}); + if (hasDynamicOperands()) { + p << " : " << getDynAddress().getType() << ", " << getDynValue().getType() + << ", " << getDynMask().getType(); + } +} + +LogicalResult AIEX::NpuMaskWrite32Op::verify() { + bool hasAddr = getDynAddress() != nullptr; + bool hasVal = getDynValue() != nullptr; + bool hasMask = getDynMask() != nullptr; + if (hasAddr != hasVal || hasAddr != hasMask) + return emitOpError( + "dynamic operands must be provided together (all or none)"); + + if (hasAddr) { + for (Value v : {getDynAddress(), getDynValue(), getDynMask()}) { + if (auto intType = dyn_cast(v.getType())) { + if (intType.getWidth() != 32) + return emitOpError("dynamic operands must be 32-bit integers"); + } + } + } + return success(); +} + +//===----------------------------------------------------------------------===// +// NpuSyncOp parse/print/verify +//===----------------------------------------------------------------------===// + +/// Parse: `aiex.npu.sync(%c, %r, %d, %ch, %cn, %rn) {attrs} : i32, ...` +/// or: `aiex.npu.sync {attrs}` +ParseResult AIEX::NpuSyncOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector dynOperands; + SmallVector dynTypes; + + bool hasDynamic = false; + if (succeeded(parser.parseOptionalLParen())) { + hasDynamic = true; + for (unsigned i = 0; i < 6; ++i) { + if (i > 0 && parser.parseComma()) + return failure(); + dynOperands.emplace_back(); + if (parser.parseOperand(dynOperands.back())) + return failure(); + } + if (parser.parseRParen()) + return failure(); + } + + if (parser.parseOptionalAttrDict(result.attributes)) + return 
failure(); + + if (hasDynamic) { + if (parser.parseColon()) + return failure(); + for (unsigned i = 0; i < 6; ++i) { + if (i > 0 && parser.parseComma()) + return failure(); + dynTypes.emplace_back(); + if (parser.parseType(dynTypes.back())) + return failure(); + } + for (unsigned i = 0; i < 6; ++i) + if (parser.resolveOperand(dynOperands[i], dynTypes[i], result.operands)) + return failure(); + } + + // Set operand segment sizes: [dyn_column..dyn_row_num] (6 segments) + int seg = hasDynamic ? 1 : 0; + result.addAttribute( + "operandSegmentSizes", + parser.getBuilder().getDenseI32ArrayAttr({seg, seg, seg, seg, seg, seg})); + + return success(); +} + +void AIEX::NpuSyncOp::print(OpAsmPrinter &p) { + if (hasDynamicOperands()) { + p << '(' << getDynColumn() << ", " << getDynRow() << ", " + << getDynDirection() << ", " << getDynChannel() << ", " + << getDynColumnNum() << ", " << getDynRowNum() << ')'; + } + p.printOptionalAttrDict((*this)->getAttrs(), + /*elidedAttrs=*/{"operandSegmentSizes"}); + if (hasDynamicOperands()) { + p << " : " << getDynColumn().getType() << ", " << getDynRow().getType() + << ", " << getDynDirection().getType() << ", " + << getDynChannel().getType() << ", " << getDynColumnNum().getType() + << ", " << getDynRowNum().getType(); + } +} + +LogicalResult AIEX::NpuSyncOp::verify() { + bool hasAny = getDynColumn() != nullptr; + bool allPresent = hasAny && getDynRow() != nullptr && + getDynDirection() != nullptr && + getDynChannel() != nullptr && + getDynColumnNum() != nullptr && getDynRowNum() != nullptr; + if (hasAny && !allPresent) + return emitOpError( + "dynamic operands must be provided together (all or none)"); + + if (hasAny) { + for (Value v : {getDynColumn(), getDynRow(), getDynDirection(), + getDynChannel(), getDynColumnNum(), getDynRowNum()}) { + if (!v.getType().isSignlessInteger(32)) + return emitOpError( + "all dynamic operands must be 32-bit signless integers"); + } + } + return success(); +} + 
//===----------------------------------------------------------------------===// // NpuBlockWriteOp //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index 1bef5b29fa1..2bff26e3460 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -9,7 +9,6 @@ //===----------------------------------------------------------------------===// #include -#include #include "aie/Dialect/AIE/IR/AIEDialect.h" #include "aie/Dialect/AIEX/AIEUtils.h" @@ -98,7 +97,11 @@ struct AIEDMATasksToNPUPass LogicalResult verifyBdInBlock(Block &block) { auto bd_ops = block.getOps(); // Exactly one BD op per block - int n_bd_ops = std::distance(bd_ops.begin(), bd_ops.end()); + int n_bd_ops = 0; + for ([[maybe_unused]] auto op : bd_ops) { + if (++n_bd_ops > 1) + break; + } if (n_bd_ops < 1) { auto error = block.getTerminator()->emitError( "Block ending in this terminator does not contain a required " @@ -133,7 +136,11 @@ struct AIEDMATasksToNPUPass LogicalResult verifyOptionalLocksInBlock(Block &block) { auto lock_ops = block.getOps(); - int n_lock_ops = std::distance(lock_ops.begin(), lock_ops.end()); + int n_lock_ops = 0; + for ([[maybe_unused]] auto op : lock_ops) { + if (++n_lock_ops > 2) + break; + } // Allow exactly 0 or 2 lock ops (acquire and release) if (n_lock_ops != 0 && n_lock_ops != 2) { AIE::UseLockOp lock_op = *lock_ops.begin(); @@ -183,7 +190,11 @@ struct AIEDMATasksToNPUPass std::optional> getOptionalLockOpsForBlock(Block &block) { auto lock_ops = block.getOps(); - int n_lock_ops = std::distance(lock_ops.begin(), lock_ops.end()); + int n_lock_ops = 0; + for ([[maybe_unused]] auto op : lock_ops) { + if (++n_lock_ops > 2) + break; + } if (n_lock_ops != 2) { return std::nullopt; } @@ -241,7 +252,8 @@ struct AIEDMATasksToNPUPass NpuAddressPatchOp::create(builder, bd_op.getLoc(), 
/*addr*/ register_addr, /*arg_idx*/ arg_idx, - /*arg_plus*/ offset); + /*arg_plus*/ offset, + /*dyn_arg_plus=*/Value{}); } else if (AIE::BufferOp buffer = llvm::dyn_cast(buf.getDefiningOp())) { uint64_t buf_addr; @@ -665,6 +677,7 @@ struct AIEDMATasksToNPUPass patterns.insert(&getContext()); if (failed(applyPartialConversion(device, target, std::move(patterns)))) { signalPassFailure(); + return; } // Lower the configuration for the BDs diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index 691d53e844e..cab2e1d9691 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -14,6 +14,7 @@ #include "aie/Dialect/AIEX/IR/AIEXDialect.h" #include "aie/Dialect/AIEX/Transforms/AIEXPasses.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" @@ -130,9 +131,32 @@ struct RtpToWrite32Pattern : OpConversionPattern { uint32_t idx = op.getIndex() * sizeof(uint32_t); uint32_t address = buffer.getAddress().value() + idx; - NpuWrite32Op::create(rewriter, op->getLoc(), address, op.getValue(), - nullptr, rewriter.getI32IntegerAttr(tile.getCol()), - rewriter.getI32IntegerAttr(tile.getRow())); + if (op.hasDynamicValue()) { + // Dynamic RTP write: compute absolute address, pass value as SSA + const AIE::AIETargetModel &tm = device.getTargetModel(); + uint32_t colShift = tm.getColumnShift(); + uint32_t rowShift = tm.getRowShift(); + uint32_t absAddr = (static_cast(tile.getCol()) << colShift) | + (static_cast(tile.getRow()) << rowShift) | + (address & 0xfffff); + + auto i32Type = rewriter.getI32Type(); + auto addrConst = arith::ConstantOp::create( + rewriter, op->getLoc(), rewriter.getIntegerAttr(i32Type, absAddr)); + + NpuWrite32Op::create(rewriter, op->getLoc(), + /*address=*/0u, /*value=*/0u, + /*buffer=*/nullptr, /*column=*/nullptr, + /*row=*/nullptr, + 
/*dyn_address=*/addrConst.getResult(), + /*dyn_value=*/adaptor.getDynValue()); + } else { + // Static path + NpuWrite32Op::create(rewriter, op->getLoc(), address, + static_cast(*op.getValue()), nullptr, + rewriter.getI32IntegerAttr(tile.getCol()), + rewriter.getI32IntegerAttr(tile.getRow())); + } rewriter.eraseOp(op); return success(); @@ -191,6 +215,50 @@ struct PushQueuetoWrite32Pattern : OpConversionPattern { } }; +/// Get an OpFoldResult as an SSA Value of type intType, creating a constant +/// if needed. If the SSA value has a different width, truncate or extend it. +static Value getAsValue(OpBuilder &builder, Location loc, OpFoldResult ofr, + Type intType) { + if (auto constVal = getConstantIntValue(ofr)) + return arith::ConstantOp::create(builder, loc, + IntegerAttr::get(intType, *constVal)); + Value val = cast(ofr); + if (val.getType() != intType) { + unsigned valBits = val.getType().getIntOrFloatBitWidth(); + unsigned tgtBits = intType.getIntOrFloatBitWidth(); + if (valBits > tgtBits) + val = arith::TruncIOp::create(builder, loc, intType, val); + else + val = arith::ExtUIOp::create(builder, loc, intType, val); + } + return val; +} + +/// Build a BD word from a list of (value, mask, shift) tuples using arith ops. +/// word = (field1 & mask1) << shift1 | (field2 & mask2) << shift2 | ... 
+static Value +buildBdWord(OpBuilder &builder, Location loc, + ArrayRef> fields) { + auto i32ty = IntegerType::get(builder.getContext(), 32); + Value result = + arith::ConstantOp::create(builder, loc, IntegerAttr::get(i32ty, 0)); + for (auto &[val, mask, shift] : fields) { + Value masked = val; + if (mask != 0xFFFFFFFF) { + auto maskConst = arith::ConstantOp::create(builder, loc, + IntegerAttr::get(i32ty, mask)); + masked = arith::AndIOp::create(builder, loc, val, maskConst); + } + if (shift > 0) { + auto shiftConst = arith::ConstantOp::create( + builder, loc, IntegerAttr::get(i32ty, shift)); + masked = arith::ShLIOp::create(builder, loc, masked, shiftConst); + } + result = arith::OrIOp::create(builder, loc, result, masked); + } + return result; +} + struct DmaToNpuPattern : OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -229,74 +297,287 @@ struct DmaToNpuPattern : OpConversionPattern { int tileCol = shimTile.getCol(); int tileRow = shimTile.getRow(); - // initialize fields to zero - auto column = zero; - auto bd_id = zero; - auto buffer_length = zero; - auto buffer_offset = zero; - auto enable_packet = zero; - auto out_of_order_id = zero; - auto packet_id = zero; - auto packet_type = zero; - auto d0_size = zero; - auto d0_stride = zero; - auto d1_size = zero; - auto d1_stride = zero; - auto d2_size = zero; - auto d2_stride = zero; - auto iteration_current = zero; - auto iteration_size = zero; - auto iteration_stride = zero; - auto next_bd = zero; - auto row = zero; - auto use_next_bd = zero; - auto valid_bd = zero; - auto lock_rel_val = zero; - auto lock_rel_id = zero; - auto lock_acq_enable = zero; - auto lock_acq_val = zero; - auto lock_acq_id = zero; - auto d0_zero_before = zero; - auto d1_zero_before = zero; - auto d2_zero_before = zero; - auto d0_zero_after = zero; - auto d1_zero_after = zero; - auto d2_zero_after = zero; - auto burst_length = zero; - - auto issue_token = BoolAttr::get(ctx, false); - auto repeat_count = zero; - 
llvm::SmallVector inputSizes = llvm::map_to_vector( - llvm::reverse(op.getMixedSizes()), - [](OpFoldResult s) { return getConstantIntValue(s).value(); }); - llvm::SmallVector inputStrides = llvm::map_to_vector( - llvm::reverse(op.getMixedStrides()), - [](OpFoldResult s) { return getConstantIntValue(s).value(); }); - llvm::SmallVector sizes(4); - llvm::SmallVector strides(4); - getHardwareStridesWraps(targetModel, op, bufferType, inputSizes, - inputStrides, sizes, strides); - int64_t offset = op.getOffsetInBytes(); - - // column - column = IntegerAttr::get(i32ty, tileCol); - - // row - row = IntegerAttr::get(i32ty, tileRow); - - // A contiguous row-major ND access on a shim NOC tile is lowered to linear - // mode (d0_size=d1_size=0) just like an already-canonical linear transfer. - // This allows naturally-expressed multidimensional transfers (e.g., a 2D - // image as [height, width]) without hitting the 10-bit ND wrap-size limit. - bool isLinear = op.isLinearTransferWithoutTransformation() || - (targetModel.isShimNOCTile(tileCol, tileRow) && - isContiguousTransfer(inputSizes, inputStrides)); - if (failed(verifyStridesWraps(op, bufferType, tileCol, tileRow, inputSizes, - inputStrides, sizes, strides, isLinear))) { - return failure(); + // Check whether all sizes and strides are compile-time constants. 
+ bool allSizesConstant = + llvm::all_of(op.getMixedSizes(), [](OpFoldResult s) { + return getConstantIntValue(s).has_value(); + }); + bool allStridesConstant = + llvm::all_of(op.getMixedStrides(), [](OpFoldResult s) { + return getConstantIntValue(s).has_value(); + }); + + if (allSizesConstant && allStridesConstant) { + // ===================================================================== + // STATIC CODE PATH -- all sizes/strides are constants + // ===================================================================== + + // initialize fields to zero + auto column = zero; + auto bd_id = zero; + auto buffer_length = zero; + auto buffer_offset = zero; + auto enable_packet = zero; + auto out_of_order_id = zero; + auto packet_id = zero; + auto packet_type = zero; + auto d0_size = zero; + auto d0_stride = zero; + auto d1_size = zero; + auto d1_stride = zero; + auto d2_size = zero; + auto d2_stride = zero; + auto iteration_current = zero; + auto iteration_size = zero; + auto iteration_stride = zero; + auto next_bd = zero; + auto row = zero; + auto use_next_bd = zero; + auto valid_bd = zero; + auto lock_rel_val = zero; + auto lock_rel_id = zero; + auto lock_acq_enable = zero; + auto lock_acq_val = zero; + auto lock_acq_id = zero; + auto d0_zero_before = zero; + auto d1_zero_before = zero; + auto d2_zero_before = zero; + auto d0_zero_after = zero; + auto d1_zero_after = zero; + auto d2_zero_after = zero; + auto burst_length = zero; + + auto issue_token = BoolAttr::get(ctx, false); + auto repeat_count = zero; + llvm::SmallVector inputSizes = llvm::map_to_vector( + llvm::reverse(op.getMixedSizes()), + [](OpFoldResult s) { return getConstantIntValue(s).value(); }); + llvm::SmallVector inputStrides = llvm::map_to_vector( + llvm::reverse(op.getMixedStrides()), + [](OpFoldResult s) { return getConstantIntValue(s).value(); }); + llvm::SmallVector sizes(4); + llvm::SmallVector strides(4); + getHardwareStridesWraps(targetModel, op, bufferType, inputSizes, + inputStrides, sizes, 
strides); + int64_t offset = op.getOffsetInBytes(); + + // column + column = IntegerAttr::get(i32ty, tileCol); + + // row + row = IntegerAttr::get(i32ty, tileRow); + + // A contiguous row-major ND access on a shim NOC tile is lowered to + // linear mode (d0_size=d1_size=0) just like an already-canonical linear + // transfer. This allows naturally-expressed multidimensional transfers + // (e.g., a 2D image as [height, width]) without hitting the 10-bit ND + // wrap-size limit. + bool isLinear = op.isLinearTransferWithoutTransformation() || + (targetModel.isShimNOCTile(tileCol, tileRow) && + isContiguousTransfer(inputSizes, inputStrides)); + if (failed(verifyStridesWraps(op, bufferType, tileCol, tileRow, + inputSizes, inputStrides, sizes, strides, + isLinear))) { + return failure(); + } + + // arg_idx and offset for block arguments + AIE::RuntimeSequenceOp seq_op = + op->getParentOfType(); + if (!seq_op) { + op->emitOpError( + "NpuDmaMemcpyNdOps must have RuntimeSequenceOp parent at " + "time of lowering."); + return failure(); + } + + mlir::Value rootMemref = memref; + int64_t subviewOffset = 0; + + // Trace through memref.subview and memref.reinterpret_cast chain, if + // any, to find root block argument + auto traceResult = traceSubviewToBlockArgument(memref); + if (!traceResult) { + return op->emitOpError( + "memref must be a block argument or subview/cast/reinterpret_cast " + "of a block argument with static offsets, sizes, and strides"); + } + rootMemref = traceResult->rootArg; + subviewOffset = traceResult->offsetInBytes; + + // Find the argument index of the root memref + Block &entryBB = seq_op.getBody().front(); + int arg_idx = -1; + for (int i = 0, e = entryBB.getNumArguments(); i < e; i++) { + if (entryBB.getArgument(i) == rootMemref) { + arg_idx = i; + break; + } + } + if (arg_idx < 0) + return failure(); + + offset += subviewOffset; + + // bd_id + bd_id = IntegerAttr::get(i32ty, op.getId()); + + // buffer_length + uint64_t buffer_length_val = 
inputSizes[0] * op.getElementTypeBitwidth() / + targetModel.getAddressGenGranularity(); + if (inputSizes.size() > 1) { + for (size_t i = 1; i < std::min(inputSizes.size(), (size_t)3); i++) { + buffer_length_val *= inputSizes[i]; + } + } + buffer_length = IntegerAttr::get(i32ty, buffer_length_val); + + // buffer_offset - zero because the complete address is set by the + // patch op + buffer_offset = IntegerAttr::get(i32ty, 0); + + // enable_packet + if (auto packetInfo = op.getPacket()) { + enable_packet = IntegerAttr::get(i32ty, 1); + packet_type = IntegerAttr::get(i32ty, packetInfo->getPktType()); + packet_id = IntegerAttr::get(i32ty, packetInfo->getPktId()); + } + + // out_of_order_id + + if (!isLinear) { + // d0_size, d0_stride + d0_size = IntegerAttr::get(i32ty, sizes[0]); + d0_stride = IntegerAttr::get(i32ty, strides[0]); + + // d1_size, d1_stride + d1_size = IntegerAttr::get(i32ty, sizes[1]); + d1_stride = IntegerAttr::get(i32ty, strides[1]); + + // d2_stride + d2_stride = IntegerAttr::get(i32ty, strides[2]); + + // d2_size + if (targetModel.isMemTile(tileCol, 0)) // Need to be any row + d2_size = IntegerAttr::get(i32ty, sizes[2]); + else + d2_size = IntegerAttr::get(i32ty, 0); + } + // iteration_current, iteration_size, iteration_stride, repeat_count + if (inputSizes[3] > 1) { + if (inputStrides[3] > 0) { + iteration_size = IntegerAttr::get(i32ty, sizes[3]); + iteration_stride = IntegerAttr::get(i32ty, strides[3]); + } else { + // We allow users to encode the repeat_count as a dimension 3 stride + // of 0. This must lower to a iteration wrap of 0, so no stride is + // ever added. We then repeat the BD using the repeat_count in + // NpuPushQueueOp. 
+ iteration_size = zero; + iteration_stride = zero; + } + } + repeat_count = IntegerAttr::get(i32ty, sizes[3]); + + // next_bd + + // use_next_bd + + // valid_bd + valid_bd = IntegerAttr::get(i32ty, 1); + + // lock_rel_val + + // lock_rel_id + + // lock_acq_enable + + // lock_acq_val + + // lock_acq_id + + // d0_zero_before + d0_zero_before = IntegerAttr::get(i32ty, op.getD0ZeroBefore()); + + // d1_zero_before + d1_zero_before = IntegerAttr::get(i32ty, op.getD1ZeroBefore()); + + // d2_zero_before + d2_zero_before = IntegerAttr::get(i32ty, op.getD2ZeroBefore()); + + // d0_zero_after + d0_zero_after = IntegerAttr::get(i32ty, op.getD0ZeroAfter()); + + // d1_zero_after + d1_zero_after = IntegerAttr::get(i32ty, op.getD1ZeroAfter()); + + // d2_zero_after + d2_zero_after = IntegerAttr::get(i32ty, op.getD2ZeroAfter()); + + // burst_size + burst_length = IntegerAttr::get(i32ty, op.getBurstLength()); + + // Set the issue_token + issue_token = BoolAttr::get(ctx, op.getIssueToken()); + // Earlier, all S2MM channels were implicitly assumed to issue a token. + // This logic is kept for now for backward compatibility. 
+ if (!isMM2S) + issue_token = BoolAttr::get(ctx, true); + + if (targetModel.isMemTile(tileCol, tileRow) && (!isMM2S) && + (op.getD0ZeroBefore() != 0 || op.getD0ZeroAfter() != 0 || + op.getD1ZeroBefore() != 0 || op.getD1ZeroAfter() != 0 || + op.getD2ZeroBefore() != 0 || op.getD2ZeroAfter() != 0)) { + op->emitOpError("MemTile supports zero padding only on MM2S direction"); + return failure(); + } + + // write the buffer descriptor to the array + NpuWriteBdOp::create( + rewriter, op->getLoc(), column, bd_id, buffer_length, buffer_offset, + enable_packet, out_of_order_id, packet_id, packet_type, d0_size, + d0_stride, d1_size, d1_stride, d2_size, d2_stride, iteration_current, + iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd, + lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id, + d0_zero_before, d1_zero_before, d2_zero_before, d0_zero_after, + d1_zero_after, d2_zero_after, burst_length); + + // compute the location of the address to patch in the bd and emit patch + // instruction to perform the patch. + uint64_t addr = + targetModel.getDmaBdAddress(tileCol, tileRow, op.getId()) + + targetModel.getDmaBdAddressOffset(tileCol, tileRow); + NpuAddressPatchOp::create(rewriter, op->getLoc(), addr, arg_idx, offset, + /*dyn_arg_plus=*/Value{}); + + // push the patched bd onto the dma task queue + NpuPushQueueOp::create( + rewriter, op->getLoc(), column, row, infoOp.getChannelDirAttr(), + infoOp.getChannelIndexAttr(), issue_token, repeat_count, bd_id); + + rewriter.eraseOp(op); + return success(); + } + + // ===================================================================== + // DYNAMIC CODE PATH -- some sizes/strides are SSA values + // ===================================================================== + // We cannot use NpuWriteBdOp (which expects all-constant fields). + // Instead, we compute the BD words using arith ops and emit: + // 1. npu_blockwrite with a static BD template (all-zero or partial) + // 2. 
npu_write32_dynamic for each BD word that depends on SSA values + // 3. npu_address_patch for the buffer pointer (same as static path) + // 4. npu_write32_dynamic for queue push if repeat_count is dynamic + + // Currently only ShimNOC tiles are supported for the dynamic path + if (!targetModel.isShimNOCTile(tileCol, tileRow)) { + return op->emitOpError( + "dynamic sizes/strides are only supported for shim tile DMAs"); } - // arg_idx and offset for block arguments + Location loc = op->getLoc(); + + // --- Common setup: arg_idx, offset, seq_op --- AIE::RuntimeSequenceOp seq_op = op->getParentOfType(); if (!seq_op) { @@ -307,19 +588,15 @@ struct DmaToNpuPattern : OpConversionPattern { mlir::Value rootMemref = memref; int64_t subviewOffset = 0; - - // Trace through memref.subview and memref.reinterpret_cast chain, if any, - // to find root block argument auto traceResult = traceSubviewToBlockArgument(memref); if (!traceResult) { return op->emitOpError( - "memref must be a block argument or subview/cast/reinterpret_cast of " - "a block argument with static offsets, sizes, and strides"); + "memref must be a block argument or subview/cast/reinterpret_cast " + "of a block argument with static offsets, sizes, and strides"); } rootMemref = traceResult->rootArg; subviewOffset = traceResult->offsetInBytes; - // Find the argument index of the root memref Block &entryBB = seq_op.getBody().front(); int arg_idx = -1; for (int i = 0, e = entryBB.getNumArguments(); i < e; i++) { @@ -331,123 +608,368 @@ struct DmaToNpuPattern : OpConversionPattern { if (arg_idx < 0) return failure(); - offset += subviewOffset; - - // bd_id - bd_id = IntegerAttr::get(i32ty, op.getId()); - - // buffer_length - uint64_t buffer_length_val = inputSizes[0] * op.getElementTypeBitwidth() / - targetModel.getAddressGenGranularity(); - if (inputSizes.size() > 1) { - for (size_t i = 1; i < std::min(inputSizes.size(), (size_t)3); i++) { - buffer_length_val *= inputSizes[i]; + // Compute the byte offset. 
In the dynamic path, offsets may be SSA + // values, so we compute an SSA Value for the offset. + bool allOffsetsConstant = + llvm::all_of(op.getMixedOffsets(), [](OpFoldResult s) { + return getConstantIntValue(s).has_value(); + }); + int64_t staticOffset = subviewOffset; + Value dynOffset; + if (allOffsetsConstant && allStridesConstant) { + staticOffset += op.getOffsetInBytes(); + } else { + // Compute offset dynamically: sum(offset[i] * stride[i]) * elemBytes + size_t elBitWidth = cast(memref.getType()) + .getElementType() + .getIntOrFloatBitWidth(); + size_t elemBytes = elBitWidth / 8; + auto offsets = op.getMixedOffsets(); + auto strides = op.getMixedStrides(); + auto i64ty = IntegerType::get(ctx, 64); + Value sum = arith::ConstantOp::create( + rewriter, loc, IntegerAttr::get(i64ty, subviewOffset)); + for (size_t i = 0; i < offsets.size(); i++) { + Value off = getAsValue(rewriter, loc, offsets[i], i64ty); + Value str = getAsValue(rewriter, loc, strides[i], i64ty); + Value prod = arith::MulIOp::create(rewriter, loc, off, str); + Value bytes = arith::MulIOp::create( + rewriter, loc, prod, + arith::ConstantOp::create(rewriter, loc, + IntegerAttr::get(i64ty, elemBytes))); + sum = arith::AddIOp::create(rewriter, loc, sum, bytes); } + // Truncate to i32 for the address patch offset + auto i32ty_ = IntegerType::get(ctx, 32); + dynOffset = arith::TruncIOp::create(rewriter, loc, i32ty_, sum); } - buffer_length = IntegerAttr::get(i32ty, buffer_length_val); - // buffer_offset - zero because the complete address is set by the patch op - buffer_offset = IntegerAttr::get(i32ty, 0); - - // enable_packet - if (auto packetInfo = op.getPacket()) { - enable_packet = IntegerAttr::get(i32ty, 1); - packet_type = IntegerAttr::get(i32ty, packetInfo->getPktType()); - packet_id = IntegerAttr::get(i32ty, packetInfo->getPktId()); + // --- Retrieve mixed sizes/strides as OpFoldResults (reversed to match + // the convention: [d0, d1, d2, d3/iter]) --- + // IMPORTANT: Use adaptor operands 
(not op operands) because inside + // applyPartialConversion the original SSA values may be remapped. + auto buildMixed = [&](ValueRange dynVals, ArrayRef staticVals) { + SmallVector result; + unsigned dynIdx = 0; + for (int64_t sv : staticVals) { + if (ShapedType::isDynamic(sv)) + result.push_back(dynVals[dynIdx++]); + else + result.push_back(rewriter.getI64IntegerAttr(sv)); + } + return result; + }; + SmallVector mixedSizesRev( + llvm::reverse(buildMixed(adaptor.getSizes(), op.getStaticSizes()))); + SmallVector mixedStridesRev( + llvm::reverse(buildMixed(adaptor.getStrides(), op.getStaticStrides()))); + + uint64_t elemWidth = op.getElementTypeBitwidth(); + uint32_t addrGran = targetModel.getAddressGenGranularity(); + + // --- Compute hardware sizes and strides as SSA Values --- + // This replicates getHardwareStridesWraps logic using arith ops for + // SSA values and folded constants for compile-time known values. + + // Helper to create i32 constants + auto cst = [&](int64_t val) -> Value { + return arith::ConstantOp::create(rewriter, loc, + IntegerAttr::get(i32ty, val)); + }; + + // Get each input size/stride as an SSA Value (i32) + Value inSize0 = getAsValue(rewriter, loc, mixedSizesRev[0], i32ty); + Value inSize1 = getAsValue(rewriter, loc, mixedSizesRev[1], i32ty); + Value inSize2 = getAsValue(rewriter, loc, mixedSizesRev[2], i32ty); + Value inSize3 = getAsValue(rewriter, loc, mixedSizesRev[3], i32ty); + Value inStride0 = getAsValue(rewriter, loc, mixedStridesRev[0], i32ty); + Value inStride1 = getAsValue(rewriter, loc, mixedStridesRev[1], i32ty); + Value inStride2 = getAsValue(rewriter, loc, mixedStridesRev[2], i32ty); + Value inStride3 = getAsValue(rewriter, loc, mixedStridesRev[3], i32ty); + + // Hardware d0_size = inputSizes[0] * elemWidth / addrGran + // NOTE: Must multiply first, then divide to avoid integer truncation. + // For bf16 (elemWidth=16, addrGran=32): 32*16/32=16, NOT 32*(16/32)=0. 
+ Value hwD0Size; + if (elemWidth == addrGran) { + hwD0Size = inSize0; + } else { + // Compute: inSize0 * elemWidth / addrGran + Value scaled = + arith::MulIOp::create(rewriter, loc, inSize0, cst(elemWidth)); + hwD0Size = arith::DivUIOp::create(rewriter, loc, scaled, cst(addrGran)); } - // out_of_order_id - - if (!isLinear) { - // d0_size, d0_stride - d0_size = IntegerAttr::get(i32ty, sizes[0]); - d0_stride = IntegerAttr::get(i32ty, strides[0]); - - // d1_size, d1_stride - d1_size = IntegerAttr::get(i32ty, sizes[1]); - d1_stride = IntegerAttr::get(i32ty, strides[1]); + // Hardware d0_stride: if elemWidth < addrGran or elemWidth > addrGran, + // stride = 0 (encoded as 1). Otherwise stride = inStride0 * elemWidth / + // addrGran - 1. + // For bf16 (elemWidth=16 < addrGran=32), hardware requires d0_stride=0 + // because the address granularity exceeds the element width; the DMA + // always transfers full 32-bit words and cannot stride at sub-word level. + // The static verifier in NpuDmaMemcpyNdOp::verify() enforces this for + // constant strides; here in the dynamic path we set it unconditionally. 
+ Value hwD0Stride; + if (elemWidth < addrGran || elemWidth > addrGran) { + hwD0Stride = cst(0); + } else { + // elemWidth == addrGran, so factor is 1 + hwD0Stride = arith::SubIOp::create(rewriter, loc, inStride0, cst(1)); + } - // d2_stride - d2_stride = IntegerAttr::get(i32ty, strides[2]); + Value zeroVal = cst(0); + Value oneVal = cst(1); - // d2_size - if (targetModel.isMemTile(tileCol, 0)) // Need to be any row - d2_size = IntegerAttr::get(i32ty, sizes[2]); - else - d2_size = IntegerAttr::get(i32ty, 0); - } - // iteration_current, iteration_size, iteration_stride, repeat_count - if (inputSizes[3] > 1) { - if (inputStrides[3] > 0) { - iteration_size = IntegerAttr::get(i32ty, sizes[3]); - iteration_stride = IntegerAttr::get(i32ty, strides[3]); + // d1_size = inputSizes[1] (no conversion) + Value hwD1Size = inSize1; + // d1_stride = inputStrides[1] * elemWidth / addrGran - 1 + // Only meaningful when d1_size > 1; set to 0 otherwise (matching static). + Value hwD1Stride; + { + Value scaled; + if (elemWidth != addrGran) { + Value s = + arith::MulIOp::create(rewriter, loc, inStride1, cst(elemWidth)); + scaled = arith::DivUIOp::create(rewriter, loc, s, cst(addrGran)); } else { - // We allow users to encode the repeat_count as a dimension 3 stride - // of 0. This must lower to a iteration wrap of 0, so no stride is - // ever added. We then repeat the BD using the repeat_count in - // NpuPushQueueOp. 
- iteration_size = zero; - iteration_stride = zero; + scaled = inStride1; } + Value strideMinusOne = + arith::SubIOp::create(rewriter, loc, scaled, oneVal); + // Guard: if size1 <= 1, stride = 0 + Value sizeGt1 = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, inSize1, oneVal); + hwD1Stride = arith::SelectOp::create(rewriter, loc, sizeGt1, + strideMinusOne, zeroVal); } - repeat_count = IntegerAttr::get(i32ty, sizes[3]); - - // next_bd - - // use_next_bd - - // valid_bd - valid_bd = IntegerAttr::get(i32ty, 1); - - // lock_rel_val - - // lock_rel_id - - // lock_acq_enable - - // lock_acq_val - // lock_acq_id - - // d0_zero_before - d0_zero_before = IntegerAttr::get(i32ty, op.getD0ZeroBefore()); - - // d1_zero_before - d1_zero_before = IntegerAttr::get(i32ty, op.getD1ZeroBefore()); - - // d2_zero_before - d2_zero_before = IntegerAttr::get(i32ty, op.getD2ZeroBefore()); - - // d0_zero_after - d0_zero_after = IntegerAttr::get(i32ty, op.getD0ZeroAfter()); - - // d1_zero_after - d1_zero_after = IntegerAttr::get(i32ty, op.getD1ZeroAfter()); + // d2_size = inputSizes[2] (no conversion) + Value hwD2Size = inSize2; + // d2_stride = inputStrides[2] * elemWidth / addrGran - 1 + // Only meaningful when d2_size > 1; set to 0 otherwise (matching static). 
+ Value hwD2Stride; + { + Value scaled; + if (elemWidth != addrGran) { + Value s = + arith::MulIOp::create(rewriter, loc, inStride2, cst(elemWidth)); + scaled = arith::DivUIOp::create(rewriter, loc, s, cst(addrGran)); + } else { + scaled = inStride2; + } + Value strideMinusOne = + arith::SubIOp::create(rewriter, loc, scaled, oneVal); + // Guard: if size2 <= 1, stride = 0 + Value sizeGt1 = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, inSize2, oneVal); + hwD2Stride = arith::SelectOp::create(rewriter, loc, sizeGt1, + strideMinusOne, zeroVal); + } - // d2_zero_after - d2_zero_after = IntegerAttr::get(i32ty, op.getD2ZeroAfter()); + // iteration_size = inputSizes[3] - 1 (when > 1, else 0) + Value hwIterSize; + { + Value sizeMinusOne = + arith::SubIOp::create(rewriter, loc, inSize3, oneVal); + // Clamp to 0: if inSize3 <= 1, iteration_size = 0 + Value sizeGt1 = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, inSize3, oneVal); + hwIterSize = arith::SelectOp::create(rewriter, loc, sizeGt1, sizeMinusOne, + zeroVal); + } - // burst_size - burst_length = IntegerAttr::get(i32ty, op.getBurstLength()); + // iteration_stride = inputStrides[3] * elemWidth / addrGran - 1 + // Only meaningful when size3 > 1 AND stride3 > 0. 
+ Value hwIterStride; + { + Value scaled; + if (elemWidth != addrGran) { + Value s = + arith::MulIOp::create(rewriter, loc, inStride3, cst(elemWidth)); + scaled = arith::DivUIOp::create(rewriter, loc, s, cst(addrGran)); + } else { + scaled = inStride3; + } + Value strideMinusOne = + arith::SubIOp::create(rewriter, loc, scaled, oneVal); + // Guard: if size3 <= 1 or stride3 <= 0, both iterSize and iterStride = 0 + Value sizeGt1 = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, inSize3, oneVal); + Value strideGt0 = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, inStride3, zeroVal); + Value active = arith::AndIOp::create(rewriter, loc, sizeGt1, strideGt0); + hwIterStride = arith::SelectOp::create(rewriter, loc, active, + strideMinusOne, zeroVal); + // Override iterSize to 0 when stride is 0 (repeat via push queue) + hwIterSize = + arith::SelectOp::create(rewriter, loc, active, hwIterSize, zeroVal); + } - // Set the issue_token - issue_token = BoolAttr::get(ctx, op.getIssueToken()); - // Earlier, all S2MM channels were implicitly assumed to issue a token. - // This logic is kept for now for backward compatibility. - if (!isMM2S) - issue_token = BoolAttr::get(ctx, true); + // repeat_count for queue push = max(inputSizes[3] - 1, 0) + // The hardware queue push field encodes the number of *additional* repeats + // (i.e., total_count - 1), matching the static path's use of + // getHardwareStridesWraps which sets sizes[3] = inputSizes[3] - 1. + // Guard against underflow when sizes[3] == 0, matching the static path's + // `if (rcVal < 0) rcVal = 0` check. 
+ Value rcSub = arith::SubIOp::create(rewriter, loc, inSize3, cst(1)); + Value rcGtZero = arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, inSize3, cst(0)); + Value repeatCount = + arith::SelectOp::create(rewriter, loc, rcGtZero, rcSub, cst(0)); + + // buffer_length = hwD0Size * d1_size * d2_size + Value bufLen = arith::MulIOp::create(rewriter, loc, hwD0Size, hwD1Size); + bufLen = arith::MulIOp::create(rewriter, loc, bufLen, hwD2Size); + + // --- Compute BD base address --- + uint32_t bdId = op.getId(); + uint64_t bdAddr = targetModel.getDmaBdAddress(tileCol, tileRow, bdId); + + // --- Emit NpuWriteBdOp with placeholder values for dynamic fields, + // then selective NpuWrite32Op overrides for dynamic BD words --- + // This produces the same blockwrite TXN format as the static path, + // with dynamic words patched via write32 overrides that the EmitC + // fusion can fold back into the blockwrite. + + uint32_t burstEnc = + getShimBurstLengthEncoding(targetModel, op.getBurstLength()); + + // Helper: check if an OpFoldResult is a compile-time constant + auto isConst = [](OpFoldResult ofr) { + return getConstantIntValue(ofr).has_value(); + }; + // Helper: get constant value or 0 as placeholder + auto getConstOr0 = [](OpFoldResult ofr) -> int64_t { + if (auto v = getConstantIntValue(ofr)) + return *v; + return 0; + }; + + // Determine which input sizes/strides are dynamic + bool d0SizeDyn = !isConst(mixedSizesRev[0]); + bool d1SizeDyn = !isConst(mixedSizesRev[1]); + bool d2SizeDyn = !isConst(mixedSizesRev[2]); + bool d3SizeDyn = !isConst(mixedSizesRev[3]); + bool d0StrideDyn = !isConst(mixedStridesRev[0]); + bool d1StrideDyn = !isConst(mixedStridesRev[1]); + bool d2StrideDyn = !isConst(mixedStridesRev[2]); + bool d3StrideDyn = !isConst(mixedStridesRev[3]); + + // Compute static placeholder values for NpuWriteBdOp attributes. 
+    // For constant fields, use the actual hardware value; for dynamic
+    // fields, use 0 as placeholder (will be overridden by write32).
+
+    // Compute static hardware values for constant fields (replicating
+    // getHardwareStridesWraps logic for constants only).
+    auto computeStaticHwD0Size = [&]() -> int64_t {
+      if (d0SizeDyn)
+        return 0;
+      int64_t s = getConstOr0(mixedSizesRev[0]);
+      return s * static_cast<int64_t>(elemWidth) /
+             static_cast<int64_t>(addrGran);
+    };
+    auto computeStaticHwD0Stride = [&]() -> int64_t {
+      if (d0StrideDyn)
+        return 0;
+      if (elemWidth < addrGran || elemWidth > addrGran)
+        return 0;
+      return getConstOr0(mixedStridesRev[0]) - 1;
+    };
+    auto computeStaticHwD1Size = [&]() -> int64_t {
+      return d1SizeDyn ? 0 : getConstOr0(mixedSizesRev[1]);
+    };
+    auto computeStaticHwD1Stride = [&]() -> int64_t {
+      if (d1StrideDyn || d1SizeDyn)
+        return 0;
+      int64_t s = getConstOr0(mixedStridesRev[1]);
+      int64_t sz = getConstOr0(mixedSizesRev[1]);
+      if (sz <= 1)
+        return 0;
+      int64_t scaled =
+          s * static_cast<int64_t>(elemWidth) / static_cast<int64_t>(addrGran);
+      return scaled - 1;
+    };
+    auto computeStaticHwD2Stride = [&]() -> int64_t {
+      if (d2StrideDyn || d2SizeDyn)
+        return 0;
+      int64_t s = getConstOr0(mixedStridesRev[2]);
+      int64_t sz = getConstOr0(mixedSizesRev[2]);
+      if (sz <= 1)
+        return 0;
+      int64_t scaled =
+          s * static_cast<int64_t>(elemWidth) / static_cast<int64_t>(addrGran);
+      return scaled - 1;
+    };
+    auto computeStaticIterSize = [&]() -> int64_t {
+      if (d3SizeDyn)
+        return 0;
+      int64_t s3 = getConstOr0(mixedSizesRev[3]);
+      if (s3 <= 1)
+        return 0;
+      if (!d3StrideDyn && getConstOr0(mixedStridesRev[3]) <= 0)
+        return 0;
+      return s3 - 1;
+    };
+    auto computeStaticIterStride = [&]() -> int64_t {
+      if (d3StrideDyn || d3SizeDyn)
+        return 0;
+      int64_t s3 = getConstOr0(mixedSizesRev[3]);
+      int64_t st3 = getConstOr0(mixedStridesRev[3]);
+      if (s3 <= 1 || st3 <= 0)
+        return 0;
+      int64_t scaled = st3 * static_cast<int64_t>(elemWidth) /
+                       static_cast<int64_t>(addrGran);
+      return scaled - 1;
+    };
+
+    // Compute static
buffer_length for constant fields + int64_t staticBufLen = 0; + if (!d0SizeDyn && !d1SizeDyn && !d2SizeDyn) { + staticBufLen = computeStaticHwD0Size() * getConstOr0(mixedSizesRev[1]) * + getConstOr0(mixedSizesRev[2]); + } - if (targetModel.isMemTile(tileCol, tileRow) && (!isMM2S) && - (op.getD0ZeroBefore() != 0 || op.getD0ZeroAfter() != 0 || - op.getD1ZeroBefore() != 0 || op.getD1ZeroAfter() != 0 || - op.getD2ZeroBefore() != 0 || op.getD2ZeroAfter() != 0)) { - op->emitOpError("MemTile supports zero padding only on MM2S direction"); - return failure(); + // Build NpuWriteBdOp attrs + auto column = IntegerAttr::get(i32ty, tileCol); + auto bd_id = IntegerAttr::get(i32ty, bdId); + auto buffer_length = IntegerAttr::get(i32ty, staticBufLen); + auto buffer_offset = IntegerAttr::get(i32ty, 0); + auto enable_packet = zero; + auto out_of_order_id = zero; + auto packet_id = zero; + auto packet_type = zero; + if (auto packetInfo = op.getPacket()) { + enable_packet = IntegerAttr::get(i32ty, 1); + packet_type = IntegerAttr::get(i32ty, packetInfo->getPktType()); + packet_id = IntegerAttr::get(i32ty, packetInfo->getPktId()); } + auto d0_size = IntegerAttr::get(i32ty, computeStaticHwD0Size()); + auto d0_stride = IntegerAttr::get(i32ty, computeStaticHwD0Stride()); + auto d1_size = IntegerAttr::get(i32ty, computeStaticHwD1Size()); + auto d1_stride = IntegerAttr::get(i32ty, computeStaticHwD1Stride()); + auto d2_size = zero; + auto d2_stride = IntegerAttr::get(i32ty, computeStaticHwD2Stride()); + auto iteration_current = zero; + auto iteration_size = IntegerAttr::get(i32ty, computeStaticIterSize()); + auto iteration_stride = IntegerAttr::get(i32ty, computeStaticIterStride()); + auto next_bd = zero; + auto row = IntegerAttr::get(i32ty, tileRow); + auto use_next_bd = zero; + auto valid_bd = IntegerAttr::get(i32ty, 1); + auto lock_rel_val = zero; + auto lock_rel_id = zero; + auto lock_acq_enable = zero; + auto lock_acq_val = zero; + auto lock_acq_id = zero; + auto d0_zero_before = 
IntegerAttr::get(i32ty, op.getD0ZeroBefore()); + auto d1_zero_before = IntegerAttr::get(i32ty, op.getD1ZeroBefore()); + auto d2_zero_before = IntegerAttr::get(i32ty, op.getD2ZeroBefore()); + auto d0_zero_after = IntegerAttr::get(i32ty, op.getD0ZeroAfter()); + auto d1_zero_after = IntegerAttr::get(i32ty, op.getD1ZeroAfter()); + auto d2_zero_after = IntegerAttr::get(i32ty, op.getD2ZeroAfter()); + auto burst_length = IntegerAttr::get(i32ty, op.getBurstLength()); - // write the buffer descriptor to the array NpuWriteBdOp::create( - rewriter, op->getLoc(), column, bd_id, buffer_length, buffer_offset, + rewriter, loc, column, bd_id, buffer_length, buffer_offset, enable_packet, out_of_order_id, packet_id, packet_type, d0_size, d0_stride, d1_size, d1_stride, d2_size, d2_stride, iteration_current, iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd, @@ -455,16 +977,150 @@ struct DmaToNpuPattern : OpConversionPattern { d0_zero_before, d1_zero_before, d2_zero_before, d0_zero_after, d1_zero_after, d2_zero_after, burst_length); - // compute the location of the address to patch in the bd and emit patch - // instruction to perform the patch. - uint64_t addr = targetModel.getDmaBdAddress(tileCol, tileRow, op.getId()) + - targetModel.getDmaBdAddressOffset(tileCol, tileRow); - NpuAddressPatchOp::create(rewriter, op->getLoc(), addr, arg_idx, offset); + // --- Emit NpuWrite32Op overrides for dynamic BD words --- + // Only emit write32 for words that contain dynamic content. + // The address is the BD base + word_index * 4. 
+ auto emitDynBdWord = [&](uint32_t wordIdx, Value wordValue) { + uint32_t wordAddr = static_cast(bdAddr) + wordIdx * 4; + Value addrSSA = cst(wordAddr); + NpuWrite32Op::create(rewriter, loc, + /*address=*/static_cast(0), + /*value=*/static_cast(0), + /*buffer=*/FlatSymbolRefAttr{}, + /*column=*/IntegerAttr{}, + /*row=*/IntegerAttr{}, + /*dyn_address=*/addrSSA, + /*dyn_value=*/wordValue); + }; + + // word[0]: buffer_length — dynamic if any of d0/d1/d2 sizes are dynamic + bool word0Dyn = d0SizeDyn || d1SizeDyn || d2SizeDyn; + if (word0Dyn) { + emitDynBdWord(0, bufLen); + } - // push the patched bd onto the dma task queue - NpuPushQueueOp::create( - rewriter, op->getLoc(), column, row, infoOp.getChannelDirAttr(), - infoOp.getChannelIndexAttr(), issue_token, repeat_count, bd_id); + // word[3]: d0_size, d0_stride — dynamic if either is dynamic + bool word3Dyn = d0SizeDyn || d0StrideDyn; + if (word3Dyn) { + emitDynBdWord( + 3, buildBdWord(rewriter, loc, + {{hwD0Size, 0x3FF, 20}, {hwD0Stride, 0xFFFFF, 0}})); + } + + // word[4]: burst_length (static), d1_size, d1_stride + bool word4Dyn = d1SizeDyn || d1StrideDyn; + if (word4Dyn) { + Value burstVal = cst((burstEnc & 0x3) << 30); + Value sizeStride = buildBdWord( + rewriter, loc, {{hwD1Size, 0x3FF, 20}, {hwD1Stride, 0xFFFFF, 0}}); + emitDynBdWord(4, + arith::OrIOp::create(rewriter, loc, burstVal, sizeStride)); + } + + // word[5]: AXCache (static), d2_stride + bool word5Dyn = d2StrideDyn || d2SizeDyn; + if (word5Dyn) { + Value axcache = cst((2u & 0xf) << 24); + Value strMasked = buildBdWord(rewriter, loc, {{hwD2Stride, 0xFFFFF, 0}}); + emitDynBdWord(5, arith::OrIOp::create(rewriter, loc, axcache, strMasked)); + } + + // word[6]: iteration_size, iteration_stride + bool word6Dyn = d3SizeDyn || d3StrideDyn; + if (word6Dyn) { + emitDynBdWord( + 6, buildBdWord(rewriter, loc, + {{hwIterSize, 0x3F, 20}, {hwIterStride, 0xFFFFF, 0}})); + } + + // word[1] (base_addr) and word[2] (packet ctrl) are always static + // word[7] (valid_bd) 
is always static + // These are fully handled by the NpuWriteBdOp above. + + // --- Address patch --- + uint64_t patchAddr = + bdAddr + targetModel.getDmaBdAddressOffset(tileCol, tileRow); + if (dynOffset) { + // Dynamic offset: pass 0 as static arg_plus and provide SSA value + NpuAddressPatchOp::create(rewriter, loc, static_cast(patchAddr), + static_cast(arg_idx), + /*arg_plus=*/static_cast(0), + /*dyn_arg_plus=*/dynOffset); + } else { + NpuAddressPatchOp::create(rewriter, loc, static_cast(patchAddr), + static_cast(arg_idx), + static_cast(staticOffset), + /*dyn_arg_plus=*/Value{}); + } + + // --- Queue push --- + // Determine issue_token + bool issueTokenVal = op.getIssueToken(); + if (!isMM2S) + issueTokenVal = true; + + // Check if repeat_count (sizes[3]) is dynamic + bool repeatCountDynamic = + !getConstantIntValue(mixedSizesRev[3]).has_value(); + + // Compute the queue push address + uint32_t ctrlOffset = targetModel.getDmaControlAddress( + tileCol, tileRow, infoOp.getChannelIndex(), channelDir); + + uint32_t queueOffset = ctrlOffset + 0x4; + + if (repeatCountDynamic) { + // Dynamic-repeat-count path lowers directly to NpuWrite32Op below, + // bypassing PushQueuetoWrite32Pattern. Emit the controller_id + // task-complete-token maskwrite inline here. (In the static `else` + // branch we create an NpuPushQueueOp; PushQueuetoWrite32Pattern + // emits the controller_id maskwrite for that case, so emitting it + // here too would duplicate the op.) + if (issueTokenVal) { + if (shimTile->hasAttr("controller_id")) { + AIE::PacketInfoAttr controllerIdAttr = + shimTile->getAttrOfType("controller_id"); + uint32_t data = controllerIdAttr.getPktId() << 8; + uint32_t mask = 0x00001F00; + NpuMaskWrite32Op::create(rewriter, loc, ctrlOffset, data, mask, + nullptr, nullptr, nullptr); + } + } + + // Build queue push command as SSA: + // cmd = (bd_id & 0xF) | ((repeat_count & 0xFF) << 16) | + // (issue_token ? 
1<<31 : 0) + Value bdIdVal = cst(bdId & 0xF); + Value rcShifted = buildBdWord(rewriter, loc, {{repeatCount, 0xFF, 16}}); + Value cmd = arith::OrIOp::create(rewriter, loc, bdIdVal, rcShifted); + if (issueTokenVal) { + Value tokenBit = cst(static_cast(0x80000000u)); + cmd = arith::OrIOp::create(rewriter, loc, cmd, tokenBit); + } + Value queueAddrSSA = cst(queueOffset); + NpuWrite32Op::create(rewriter, loc, rewriter.getUI32IntegerAttr(0), + rewriter.getUI32IntegerAttr(0), + /*buffer=*/FlatSymbolRefAttr(), + /*column=*/IntegerAttr(), + /*row=*/IntegerAttr(), + /*dyn_address=*/queueAddrSSA, /*dyn_value=*/cmd); + } else { + // Static queue push + auto columnAttr = IntegerAttr::get(i32ty, tileCol); + auto rowAttr = IntegerAttr::get(i32ty, tileRow); + auto bdIdAttr = IntegerAttr::get(i32ty, bdId); + auto issueTokenAttr = BoolAttr::get(ctx, issueTokenVal); + // repeat_count is constant here. Apply the same -1 conversion + // as getHardwareStridesWraps (hardware encodes additional repeats). + int64_t rcVal = getConstantIntValue(mixedSizesRev[3]).value() - 1; + if (rcVal < 0) + rcVal = 0; + auto repeatCountAttr = IntegerAttr::get(i32ty, rcVal); + NpuPushQueueOp::create(rewriter, loc, columnAttr, rowAttr, + infoOp.getChannelDirAttr(), + infoOp.getChannelIndexAttr(), issueTokenAttr, + repeatCountAttr, bdIdAttr); + } rewriter.eraseOp(op); return success(); @@ -710,6 +1366,7 @@ struct WriteBdToBlockWritePattern : OpConversionPattern { struct AIEDmaToNpuPass : xilinx::AIEX::impl::AIEDmaToNpuBase { void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); registry.insert(); } @@ -719,6 +1376,7 @@ struct AIEDmaToNpuPass : xilinx::AIEX::impl::AIEDmaToNpuBase { ConversionTarget target(getContext()); target.addLegalDialect(); + target.addLegalDialect(); target.addLegalDialect(); target.addLegalOp(); target.addLegalOp(); @@ -746,8 +1404,12 @@ struct AIEDmaToNpuPass : xilinx::AIEX::impl::AIEDmaToNpuBase { patterns.insert(&getContext()); 
patterns.insert(&getContext()); - if (failed(applyPartialConversion(device, target, std::move(patterns)))) + FrozenRewritePatternSet frozenPatterns(std::move(patterns)); + + if (failed(applyPartialConversion(device, target, frozenPatterns))) { signalPassFailure(); + return; + } } }; diff --git a/lib/Dialect/AIEX/Transforms/AIELowerSetLock.cpp b/lib/Dialect/AIEX/Transforms/AIELowerSetLock.cpp index b3f9cd88ede..2cba804c947 100644 --- a/lib/Dialect/AIEX/Transforms/AIELowerSetLock.cpp +++ b/lib/Dialect/AIEX/Transforms/AIELowerSetLock.cpp @@ -78,6 +78,7 @@ struct AIELowerSetLockPass if (failed(applyPartialConversion(device, target, std::move(patterns)))) { signalPassFailure(); + return; } } }; diff --git a/lib/Dialect/AIEX/Transforms/AIEMaterializeBDChains.cpp b/lib/Dialect/AIEX/Transforms/AIEMaterializeBDChains.cpp index c9dc528dc8c..3dc24930399 100644 --- a/lib/Dialect/AIEX/Transforms/AIEMaterializeBDChains.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEMaterializeBDChains.cpp @@ -128,6 +128,7 @@ struct AIEMaterializeBDChainsPass GreedyRewriteConfig rewriter_config = GreedyRewriteConfig(); rewriter_config.setRegionSimplificationLevel( GreedySimplifyRegionLevel::Disabled); + rewriter_config.enableConstantCSE(false); RewritePatternSet patterns_0(ctx); patterns_0.insert(ctx); diff --git a/lib/Dialect/AIEX/Transforms/CMakeLists.txt b/lib/Dialect/AIEX/Transforms/CMakeLists.txt index fb205cb4048..e98db1462e3 100644 --- a/lib/Dialect/AIEX/Transforms/CMakeLists.txt +++ b/lib/Dialect/AIEX/Transforms/CMakeLists.txt @@ -37,6 +37,7 @@ add_mlir_dialect_library(AIEXTransforms LINK_LIBS PUBLIC AIE + AIETransforms MLIRAIEToConfiguration AIEXUtils MLIRIR diff --git a/lib/Targets/AIENpuLowering.cpp b/lib/Targets/AIENpuLowering.cpp new file mode 100644 index 00000000000..e202a81afa3 --- /dev/null +++ b/lib/Targets/AIENpuLowering.cpp @@ -0,0 +1,37 @@ +//===- AIENpuLowering.cpp - Shared NPU lowering pipeline --------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 
with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include "aie/Targets/AIENpuLowering.h" + +#include "aie/Dialect/AIE/IR/AIEDialect.h" +#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h" + +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +namespace xilinx::AIE { + +void populateNpuLoweringPipeline(PassManager &pm, bool skipMaterialize) { + if (!skipMaterialize) + pm.addPass(AIEX::createAIEMaterializeRuntimeSequencesPass()); + + OpPassManager &devicePm = pm.nest(); + devicePm.addPass(AIEX::createAIEMaterializeBDChainsPass()); + devicePm.addPass(AIEX::createAIESubstituteShimDMAAllocationsPass()); + devicePm.addPass(AIEX::createAIEAssignRuntimeSequenceBDIDsPass()); + devicePm.addPass(createCanonicalizerPass()); + devicePm.addPass(AIEX::createAIEDMATasksToNPUPass()); + devicePm.addPass(AIEX::createAIEDmaToNpuPass()); + devicePm.addPass(AIEX::createAIELowerSetLockPass()); +} + +} // namespace xilinx::AIE diff --git a/lib/Targets/AIETargetBCF.cpp b/lib/Targets/AIETargetBCF.cpp index 4148841a53d..445de588406 100644 --- a/lib/Targets/AIETargetBCF.cpp +++ b/lib/Targets/AIETargetBCF.cpp @@ -143,12 +143,14 @@ LogicalResult AIETranslateToBCF(ModuleOp module, raw_ostream &output, if (auto filesAttr = coreOp.getLinkFiles()) { // Canonical path: link_files populated by aie-assign-core-link-files. for (auto f : filesAttr->getAsRange()) - output << "_include _file " << f.getValue() << "\n"; + if (!f.getValue().empty()) + output << "_include _file " << f.getValue() << "\n"; } else if (coreOp.getLinkWith()) { // Deprecated fallback: core-level link_with was not migrated by // aie-assign-core-link-files (e.g., the pass was not run). 
- output << "_include _file " << coreOp.getLinkWith().value().str() - << "\n"; + if (!coreOp.getLinkWith().value().empty()) + output << "_include _file " << coreOp.getLinkWith().value().str() + << "\n"; } } output << "_resolve _main core_" << tile.getCol() << "_" << tile.getRow() diff --git a/lib/Targets/AIETargetCppTxn.cpp b/lib/Targets/AIETargetCppTxn.cpp new file mode 100644 index 00000000000..32ddce6e830 --- /dev/null +++ b/lib/Targets/AIETargetCppTxn.cpp @@ -0,0 +1,72 @@ +//===- AIETargetCppTxn.cpp - EmitC-based C++ TXN translation ------*- C++ +//-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Translates MLIR runtime sequences to compilable C++ code by: +// 1. Running the NPU lowering pipeline (same as aiecc) to lower high-level +// DMA task ops to npu.write32/blockwrite/sync/address_patch +// 2. Running ConvertAIEXToEmitCPass to lower those to EmitC dialect +// 3. Calling translateToCpp() to emit C++ from EmitC IR +// +// The generated C++ code #includes TxnEncoding.h and calls its functions to +// build TXN instruction binaries at runtime. +// +//===----------------------------------------------------------------------===// + +#include "aie/Targets/AIETargets.h" + +#include "aie/Conversion/AIEXToEmitC/AIEXToEmitC.h" +#include "aie/Targets/AIENpuLowering.h" + +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Target/Cpp/CppEmitter.h" + +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; + +namespace xilinx { +namespace AIE { + +LogicalResult AIETranslateToCppTxn(ModuleOp module, llvm::raw_ostream &output) { + // Clone the module so we don't mutate the original. 
+  OwningOpRef<ModuleOp> clonedModule = module.clone();
+  auto *ctx = clonedModule->getContext();
+
+  // Step 1: Run NPU lowering pipeline to lower high-level DMA task ops
+  // (dma_configure_task_for, dma_start_task, dma_await_task, etc.)
+  // down to npu.write32/blockwrite/sync/address_patch.
+  {
+    PassManager pm(ctx);
+    // Skip materialize pass: the runtime_sequence is already in final form
+    // (no aiex.run calls to inline) and this translation path only needs the
+    // shared NPU lowering pipeline before EmitC conversion.
+    populateNpuLoweringPipeline(pm, /*skipMaterialize=*/true);
+
+    if (failed(pm.run(*clonedModule)))
+      return module.emitError("NPU lowering pipeline failed");
+  }
+
+  // Step 2: Run the AIEX-to-EmitC conversion pass.
+  {
+    PassManager pm(ctx);
+    pm.addPass(createConvertAIEXToEmitCPass());
+    if (failed(pm.run(*clonedModule)))
+      return module.emitError("Failed to convert AIEX to EmitC");
+  }
+
+  // Step 3: Translate EmitC IR to C++.
+  return emitc::translateToCpp(*clonedModule, output,
+                               /*declareVariablesAtTop=*/false);
+}
+
+} // namespace AIE
+} // namespace xilinx
diff --git a/lib/Targets/AIETargetLdScript.cpp b/lib/Targets/AIETargetLdScript.cpp
index 1cf87059c3f..8317b797f7d 100644
--- a/lib/Targets/AIETargetLdScript.cpp
+++ b/lib/Targets/AIETargetLdScript.cpp
@@ -182,11 +182,13 @@ SECTIONS
       if (auto filesAttr = coreOp.getLinkFiles()) {
         // Canonical path: link_files populated by aie-assign-core-link-files.
         for (auto f : filesAttr->getAsRange<StringAttr>())
-          output << "INPUT(" << f.getValue() << ")\n";
+          if (!f.getValue().empty())
+            output << "INPUT(" << f.getValue() << ")\n";
       } else if (auto fileAttr = coreOp.getLinkWith()) {
         // Deprecated fallback: core-level link_with was not migrated by
         // aie-assign-core-link-files (e.g., the pass was not run).
- output << "INPUT(" << fileAttr.value().str() << ")\n"; + if (!fileAttr.value().empty()) + output << "INPUT(" << fileAttr.value() << ")\n"; } output << "PROVIDE(main = core_" << tile.getCol() << "_" diff --git a/lib/Targets/AIETargetNPU.cpp b/lib/Targets/AIETargetNPU.cpp index abdff114091..da6962e08ce 100644 --- a/lib/Targets/AIETargetNPU.cpp +++ b/lib/Targets/AIETargetNPU.cpp @@ -12,6 +12,7 @@ #include "aie/Dialect/AIE/IR/AIEDialect.h" #include "aie/Dialect/AIEX/IR/AIEXDialect.h" +#include "aie/Runtime/TxnEncoding.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Interfaces/DataLayoutInterfaces.h" @@ -23,42 +24,6 @@ #include -extern "C" { -// #include "xaiengine/xaie_txn.h" -// see aie-rt commit a6196eb, xaiengine/xaie_txn.h for source of this enum -typedef enum { - XAIE_IO_WRITE, - XAIE_IO_BLOCKWRITE, - XAIE_IO_BLOCKSET, - XAIE_IO_MASKWRITE, - XAIE_IO_MASKPOLL, - XAIE_IO_NOOP, - XAIE_IO_PREEMPT, - XAIE_IO_MASKPOLL_BUSY, - XAIE_IO_LOADPDI, - XAIE_IO_LOAD_PM_START, - XAIE_IO_CREATE_SCRATCHPAD, - XAIE_IO_UPDATE_STATE_TABLE, - XAIE_IO_UPDATE_REG, - XAIE_IO_UPDATE_SCRATCH, - XAIE_CONFIG_SHIMDMA_BD, - XAIE_CONFIG_SHIMDMA_DMABUF_BD, - XAIE_IO_CUSTOM_OP_BEGIN = 1U << 7U, - XAIE_IO_CUSTOM_OP_TCT = XAIE_IO_CUSTOM_OP_BEGIN, - XAIE_IO_CUSTOM_OP_DDR_PATCH, // Previously this was XAIE_IO_CUSTOM_OP_BEGIN + - // 1 - XAIE_IO_CUSTOM_OP_READ_REGS, // Previously this was XAIE_IO_CUSTOM_OP_BEGIN + - // 2 - XAIE_IO_CUSTOM_OP_RECORD_TIMER, // Previously this was XAIE_IO_CUSTOM_OP_BEGIN - // + 3 - XAIE_IO_CUSTOM_OP_MERGE_SYNC, // Previously this was XAIE_IO_CUSTOM_OP_BEGIN + - // 4 - XAIE_IO_CUSTOM_OP_NEXT, - XAIE_IO_LOAD_PM_END_INTERNAL = 200, - XAIE_IO_CUSTOM_OP_MAX = UCHAR_MAX, -} XAie_TxnOpcode; -} - using namespace mlir; using namespace xilinx; using namespace xilinx::AIE; @@ -66,11 +31,8 @@ using namespace xilinx::AIEX; namespace { -// Example: -// - instructions = {3,4,5} -// - tailSize = 2 -// instructions becomes {3,4,5,0,0} and -// a mutable reference to the tail 
{0,0} is returned. +// Helper function for reserving space in instruction vector (still used by +// appendBlockWrite and control packet translation below). llvm::MutableArrayRef reserveAndGetTail(std::vector &instructions, uint64_t tailSize) { auto oldSize = instructions.size(); @@ -80,123 +42,83 @@ reserveAndGetTail(std::vector &instructions, uint64_t tailSize) { tailSize); } -void appendSync(std::vector &instructions, NpuSyncOp op) { - - auto words = reserveAndGetTail(instructions, 4); - - // XAIE_IO_CUSTOM_OP_TCT - words[0] = XAIE_IO_CUSTOM_OP_TCT; - - words[1] = words.size() * sizeof(uint32_t); // Operation Size +// Thin wrappers that extract MLIR attributes and delegate to TxnEncoding.h. - words[2] |= static_cast(op.getDirection()) & 0xff; - words[2] |= (op.getRow() & 0xff) << 8; - words[2] |= (op.getColumn() & 0xff) << 16; - - words[3] |= (op.getRowNum() & 0xff) << 8; - words[3] |= (op.getColumnNum() & 0xff) << 16; - words[3] |= (op.getChannel() & 0xff) << 24; +LogicalResult appendSync(std::vector &instructions, NpuSyncOp op) { + if (op.hasDynamicOperands()) + return op.emitOpError("cannot translate dynamic operands to binary; use " + "--aie-generate-txn-cpp instead"); + aie_runtime::txn_append_sync(instructions, op.getColumn(), op.getRow(), + static_cast(op.getDirection()), + op.getChannel(), op.getColumnNum(), + op.getRowNum()); + return success(); } -void appendWrite32(std::vector &instructions, NpuWrite32Op op) { - - auto words = reserveAndGetTail(instructions, 6); - - if (op.getBuffer()) { - op.emitOpError("Cannot translate symbolic address"); - return; - } - - // XAIE_IO_WRITE - words[0] = XAIE_IO_WRITE; - words[2] = *op.getAbsoluteAddress(); - words[3] = 0; // Extra bits for Reg Offset - words[4] = op.getValue(); // Value - words[5] = words.size() * sizeof(uint32_t); // Operation Size +LogicalResult appendWrite32(std::vector &instructions, + NpuWrite32Op op) { + if (op.hasDynamicOperands()) + return op.emitOpError("cannot translate dynamic operands to 
binary; use " + "--aie-generate-txn-cpp instead"); + if (op.getBuffer()) + return op.emitOpError("Cannot translate symbolic address"); + aie_runtime::txn_append_write32(instructions, *op.getAbsoluteAddress(), + op.getValue()); + return success(); } -void appendMaskWrite32(std::vector &instructions, - NpuMaskWrite32Op op) { - - auto words = reserveAndGetTail(instructions, 7); - - if (op.getBuffer()) { - op.emitOpError("Cannot translate symbolic address"); - return; - } - - // XAIE_IO_MASKWRITE - words[0] = XAIE_IO_MASKWRITE; - words[2] = *op.getAbsoluteAddress(); - words[3] = 0; - words[4] = op.getValue(); // Value - words[5] = op.getMask(); // Mask - words[6] = words.size() * sizeof(uint32_t); // Operation Size +LogicalResult appendMaskWrite32(std::vector &instructions, + NpuMaskWrite32Op op) { + if (op.hasDynamicOperands()) + return op.emitOpError("cannot translate dynamic operands to binary; use " + "--aie-generate-txn-cpp instead"); + if (op.getBuffer()) + return op.emitOpError("Cannot translate symbolic address"); + aie_runtime::txn_append_maskwrite32(instructions, *op.getAbsoluteAddress(), + op.getValue(), op.getMask()); + return success(); } void appendLoadPdi(std::vector &instructions, NpuLoadPdiOp op) { - - auto words = reserveAndGetTail(instructions, 4); - - // XAIE_IO_LOADPDI - words[0] = XAIE_IO_LOADPDI; - words[0] |= op.getId() << 16; - std::optional size = op.getSize(); - if (size) - words[1] = *size; - std::optional address = op.getAddress(); - if (address) { - words[2] = *address; - words[3] = *address >> 32; - } + aie_runtime::txn_append_loadpdi(instructions, op.getId(), op.getSize(), + op.getAddress()); } void appendAddressPatch(std::vector &instructions, NpuAddressPatchOp op) { - - auto words = reserveAndGetTail(instructions, 12); - - // XAIE_IO_CUSTOM_OP_DDR_PATCH - words[0] = XAIE_IO_CUSTOM_OP_DDR_PATCH; - words[1] = words.size() * sizeof(uint32_t); // Operation Size - - words[5] = 0; // Action - - words[6] = op.getAddr(); - - words[8] = 
op.getArgIdx(); - - words[10] = op.getArgPlus(); + aie_runtime::txn_append_address_patch(instructions, op.getAddr(), + op.getArgIdx(), + static_cast(op.getArgPlus())); } -void appendBlockWrite(std::vector &instructions, NpuBlockWriteOp op) { - unsigned payload_start = 4; - +LogicalResult appendBlockWrite(std::vector &instructions, + NpuBlockWriteOp op) { std::optional address = op.getAbsoluteAddress(); DenseIntElementsAttr data = op.getDataWords(); - auto words = reserveAndGetTail(instructions, data.size() + payload_start); + // Extract payload into a temporary buffer. + std::vector payload; + payload.reserve(data.size()); + for (auto d : data) + payload.push_back(d.getZExtValue()); - // XAIE_IO_BLOCKWRITE - words[0] = XAIE_IO_BLOCKWRITE; - words[2] = op.getAddress(); + // Use encoding library for the core format, then fix up col/row field. + aie_runtime::txn_append_blockwrite(instructions, *address, payload.data(), + payload.size()); + + // The encoding library leaves word[1] as 0. If col/row are present, set it. 
auto col = op.getColumn(); auto row = op.getRow(); if (col && row) { - words[1] = (*col & 0xff) | ((*row & 0xff) << 8); + // word[1] is at position (current_size - headerSize - count + 1) + size_t headerPos = instructions.size() - 4 - payload.size(); + instructions[headerPos + 1] = (*col & 0xff) | ((*row & 0xff) << 8); } - words[2] = *address; - words[3] = words.size() * sizeof(uint32_t); // Operation Size - - unsigned i = payload_start; - for (auto d : data) - words[i++] = d.getZExtValue(); + return success(); } void appendPreempt(std::vector &instructions, NpuPreemptOp op) { - - auto words = reserveAndGetTail(instructions, 1); - words[0] = XAIE_IO_PREEMPT | (op.getLevel() << 8); + aie_runtime::txn_append_preempt(instructions, op.getLevel()); } } // namespace @@ -211,22 +133,16 @@ LogicalResult xilinx::AIE::AIETranslateNpuToBinary( return failure(); } - auto words = reserveAndGetTail(instructions, 4); - const AIETargetModel &tm = deviceOp.getTargetModel(); - // setup txn header - uint8_t major = 0; - uint8_t minor = 1; - uint8_t devGen = 3; // NPU (PHX HWK) - if (llvm::isa(tm)) - devGen = 4; // NPU2 (STX KRK) - uint8_t numRows = tm.rows(); - uint8_t numCols = tm.columns(); - uint8_t numMemTileRows = tm.getNumMemTileRows(); - uint32_t count = 0; - words[0] = (numRows << 24) | (devGen << 16) | (minor << 8) | major; - words[1] = (numMemTileRows << 8) | numCols; + // Build device info for the TXN header. + aie_runtime::TxnDeviceInfo devInfo; + devInfo.major = 0; + devInfo.minor = 1; + devInfo.devGen = llvm::isa(tm) ? 4 : 3; + devInfo.numRows = tm.rows(); + devInfo.numCols = tm.columns(); + devInfo.numMemTileRows = tm.getNumMemTileRows(); AIE::RuntimeSequenceOp seq = AIE::RuntimeSequenceOp::getForSymbolInDeviceOrError(deviceOp, @@ -234,24 +150,34 @@ LogicalResult xilinx::AIE::AIETranslateNpuToBinary( if (!seq) { return failure(); } + + // Reserve header space up front so txn_prepend_header can overwrite in-place. 
+ aie_runtime::txn_init(instructions); + + uint32_t count = 0; + LogicalResult result = success(); for (Block &block : seq.getBody()) { for (Operation &o : block) { llvm::TypeSwitch(&o) .Case([&](auto op) { count++; - appendSync(instructions, op); + if (failed(appendSync(instructions, op))) + result = failure(); }) .Case([&](auto op) { count++; - appendWrite32(instructions, op); + if (failed(appendWrite32(instructions, op))) + result = failure(); }) .Case([&](auto op) { count++; - appendBlockWrite(instructions, op); + if (failed(appendBlockWrite(instructions, op))) + result = failure(); }) .Case([&](auto op) { count++; - appendMaskWrite32(instructions, op); + if (failed(appendMaskWrite32(instructions, op))) + result = failure(); }) .Case([&](auto op) { count++; @@ -267,10 +193,11 @@ LogicalResult xilinx::AIE::AIETranslateNpuToBinary( }); } } + if (failed(result)) + return failure(); - // write size fields of the txn header - instructions[2] = count; - instructions[3] = instructions.size() * sizeof(uint32_t); // size of the txn + // Finalize the TXN header (overwrites the 4 reserved words). 
+ aie_runtime::txn_prepend_header(instructions, count, devInfo); return success(); } diff --git a/lib/Targets/AIETargets.cpp b/lib/Targets/AIETargets.cpp index fa771150368..20153b411b9 100644 --- a/lib/Targets/AIETargets.cpp +++ b/lib/Targets/AIETargets.cpp @@ -409,5 +409,13 @@ void registerAIETranslations() { return AIETranslateToUcDma(module, output); }, registerDialects); + + TranslateFromMLIRRegistration registrationCppTxn( + "aie-generate-txn-cpp", + "Generate C++ code for runtime TXN binary generation", + [](ModuleOp module, raw_ostream &output) { + return AIETranslateToCppTxn(module, output); + }, + registerDialects); } } // namespace xilinx::AIE diff --git a/lib/Targets/CMakeLists.txt b/lib/Targets/CMakeLists.txt index 0af04558a6b..0deb5548ee5 100644 --- a/lib/Targets/CMakeLists.txt +++ b/lib/Targets/CMakeLists.txt @@ -40,8 +40,10 @@ add_dependencies(obj.AIERT xaienginecdo_static xaienginecdo_static-headers) add_mlir_library(AIETargets AIETargets.cpp + AIENpuLowering.cpp AIETargetBCF.cpp AIETargetCDODirect.cpp + AIETargetCppTxn.cpp AIETargetNPU.cpp AIETargetLdScript.cpp AIETargetUcCert.cpp @@ -84,9 +86,14 @@ add_mlir_library(AIETargets LINK_LIBS PUBLIC AIERT AIE + AIETransforms AIEX + AIEXTransforms AIEXUtils ADF + MLIRAIEXToEmitC + MLIREmitCDialect + MLIRTargetCpp ) target_link_libraries(AIETargets PRIVATE xaienginecdo_static) diff --git a/programming_examples/basic/matrix_multiplication/single_core/single_core_iron.py b/programming_examples/basic/matrix_multiplication/single_core/single_core_iron.py index aaa6574fa7e..4fe0f11c370 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/single_core_iron.py +++ b/programming_examples/basic/matrix_multiplication/single_core/single_core_iron.py @@ -12,7 +12,6 @@ from aie.iron.controlflow import range_ from aie.helpers.taplib import TensorAccessSequence, TensorTiler2D - microkernel_mac_dim_map = { "npu": { "bf16": (4, 8, 4), diff --git 
a/programming_examples/basic/matrix_multiplication/single_core_dynamic/Makefile b/programming_examples/basic/matrix_multiplication/single_core_dynamic/Makefile new file mode 100644 index 00000000000..95cbfb99b39 --- /dev/null +++ b/programming_examples/basic/matrix_multiplication/single_core_dynamic/Makefile @@ -0,0 +1,92 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. +# +##===----------------------------------------------------------------------===## + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) +subdir=single_core_dynamic +targetname=single_core_dynamic + +# Dynamic design always uses bf16->f32, m=k=n=32 +m=32 +k=32 +n=32 +M?=128 +K?=128 +N?=128 +dtype_in=bf16 +dtype_out=f32 +devicename?=npu2 + +kernels=mm_${m}x${k}x${n} + +use_placed?=0 +use_iron?=0 + +ifeq (${use_placed}, 1) +aie_py_src=single_core_dynamic_placed.py +else ifeq (${use_iron}, 1) +aie_py_src=single_core_dynamic_iron.py +else +aie_py_src=single_core_dynamic.py +endif + +KERNEL_DEFINES=-D${dtype_in}_${dtype_out}_ONLY -DDIM_M=${m} -DDIM_K=${k} -DDIM_N=${n} + +include ${srcdir}/../makefile-common + +build/mm_${m}x${k}x${n}.o: ${kernels_dir}/mm.cc + mkdir -p ${@D} + cd ${@D} && ${KERNEL_CC} ${KERNEL_CFLAGS} ${KERNEL_DEFINES} -c $< -o ${@F} + +# --- Auto-generated TXN targets (via aiecc --aie-generate-txn-cpp) --- + +MLIR_AIE_ROOT ?= $(if $(MLIR_AIE_INSTALL_DIR),$(MLIR_AIE_INSTALL_DIR),$(shell cd ${srcdir}/../../../.. 
&& pwd)) +RUNTIME_LIB_ROOT ?= $(if $(MLIR_AIE_INSTALL_DIR),${MLIR_AIE_ROOT}/runtime_lib/x86_64,${MLIR_AIE_ROOT}/runtime_lib) +XRT_INC_DIR ?= /opt/xilinx/xrt/include +XRT_LIB_DIR ?= /opt/xilinx/xrt/lib +TEST_UTILS_DIR ?= $(if $(MLIR_AIE_INSTALL_DIR),${RUNTIME_LIB_ROOT}/test_lib/include,${RUNTIME_LIB_ROOT}/test_lib) + +# Generate dynamic MLIR with SSA M/K/N parameters in runtime_sequence +build/dynsize/aie_gemm_dynamic.mlir: ${srcdir}/single_core_dynamic.py + mkdir -p ${@D} + python3 $< --dev ${devicename} -M ${M} -K ${K} -N ${N} -m ${m} -k ${k} -n ${n} \ + --dtype_in ${dtype_in} --dtype_out ${dtype_out} > $@ + +# Build XCLBIN and generate C++ TXN in one unified aiecc invocation. +# The aiecc SCF→CF lowering explicitly keeps aie.runtime_sequence legal, so +# SCF is preserved for EmitC while core bodies are lowered for codegen. +# Both XCLBIN and TXN use identical buffer addresses from the same MLIR. +build/dynsize/final_dynamic.xclbin build/dynsize/generated_gemm_txn.h &: build/dynsize/aie_gemm_dynamic.mlir build/mm_${m}x${k}x${n}.o + mkdir -p build/dynsize + cp build/mm_${m}x${k}x${n}.o build/dynsize/ + cd build/dynsize && aiecc --alloc-scheme=basic-sequential \ + --aie-generate-xclbin --xclbin-name=final_dynamic.xclbin \ + --aie-generate-txn-cpp --txn-cpp-name=generated_gemm_txn.h \ + --no-xchesscc --no-xbridge --peano ${PEANO_INSTALL_DIR} \ + $(<:%=../../%) + +# Build test executable using auto-generated TXN (no --instr needed) +dynamic_generated.exe: ${srcdir}/test_dynamic.cpp build/dynsize/generated_gemm_txn.h + ${CXX} -std=c++23 -O2 -ggdb -o $@ $< \ + -Ibuild/dynsize \ + -I${MLIR_AIE_ROOT}/include \ + -I${srcdir} -I${srcdir}/.. 
\ + -I${XRT_INC_DIR} -I${TEST_UTILS_DIR} \ + -I${RUNTIME_LIB_ROOT} \ + -L${XRT_LIB_DIR} -lxrt_coreutil + +.PHONY: run_dynamic_generated +run_dynamic_generated: dynamic_generated.exe build/dynsize/final_dynamic.xclbin + ./$< -x build/dynsize/final_dynamic.xclbin -k MLIR_AIE -M 32 -K 32 -N 32 -v 1 + ./$< -x build/dynsize/final_dynamic.xclbin -k MLIR_AIE -M 64 -K 64 -N 64 -v 1 + ./$< -x build/dynsize/final_dynamic.xclbin -k MLIR_AIE -M 96 -K 96 -N 96 -v 1 + ./$< -x build/dynsize/final_dynamic.xclbin -k MLIR_AIE -M 128 -K 128 -N 128 -v 1 + +.PHONY: run_dynamic +run_dynamic: run_dynamic_generated diff --git a/programming_examples/basic/matrix_multiplication/single_core_dynamic/README.md b/programming_examples/basic/matrix_multiplication/single_core_dynamic/README.md new file mode 100644 index 00000000000..c9cf6d771b5 --- /dev/null +++ b/programming_examples/basic/matrix_multiplication/single_core_dynamic/README.md @@ -0,0 +1,312 @@ +# Dynamic Single-Core Matrix Multiplication + +Single-core bf16 GEMM with **runtime-configurable matrix dimensions**. One compiled XCLBIN supports any M/K/N that are multiples of 32 — matrix sizes are determined at runtime, not compile time. 
+ +## Quick Start + +```bash +# Build everything (XCLBIN + C++ TXN code + test executable) +make build/dynsize/final_dynamic.xclbin M=128 K=128 N=128 devicename=npu2 +make dynamic_generated.exe M=128 K=128 N=128 devicename=npu2 + +# Run multiple sizes from the same XCLBIN +./dynamic_generated.exe -x build/dynsize/final_dynamic.xclbin -k MLIR_AIE -M 32 -K 32 -N 32 -v 1 +./dynamic_generated.exe -x build/dynsize/final_dynamic.xclbin -k MLIR_AIE -M 64 -K 64 -N 64 -v 1 +./dynamic_generated.exe -x build/dynsize/final_dynamic.xclbin -k MLIR_AIE -M 128 -K 128 -N 128 -v 1 + +# Or run all at once +make run_dynamic_generated M=128 K=128 N=128 devicename=npu2 +``` + +## Performance + +All sizes from a single XCLBIN on NPU Strix Halo (NPU2): + +| Size | TXN Instructions | Time | GFLOPS | Status | +|------|-----------------|------|--------|--------| +| 32x32x32 | 225 words | 1275 us | 0.05 | PASS | +| 64x32x64 | 357 words | 1096 us | 0.24 | PASS | +| 64x64x64 | 357 words | 866 us | 0.61 | PASS | +| 96x96x96 | 566 words | 760 us | 2.33 | PASS | +| 128x64x128 | 698 words | 1103 us | 1.90 | PASS | +| 128x128x128 | 698 words | 1575 us | 2.66 | PASS | + +## How It Works + +The design has a fixed **tile size** (32x32x32 bf16->f32) but **variable problem size** (M, K, N). The core runs an infinite loop that reads iteration counts from RTP (Runtime Tunable Parameters), and the host generates DMA instruction sequences at runtime for each problem size. 
+ +### Architecture + +``` + ┌──────────────────────────┐ + │ single_core_dynamic.py │ + │ --dynamic-txn │ + └────────────┬─────────────┘ + │ + aie_gemm_dynamic.mlir + (one MLIR, three regions) + │ + ┌──────────────────┼──────────────────┐ + │ │ │ + aie.core aie.device aie.runtime_sequence + (compute kernel) (hw configuration) (DMA orchestration) + scf.while loops objectFIFOs, locks SSA M/K/N params + RTP-driven bounds buffer placement scf.for/scf.if loops + │ │ │ + ▼ ▼ ▼ + ┌─────────────────┐ ┌──────────────────┐ ┌─────────────────────┐ + │ Peano clang++ │ │ CDO/PDI/xclbin │ │ EmitC → C++ codegen │ + │ → core_0_2.elf │ │ → final.xclbin │ │ → generated_txn.h │ + └─────────────────┘ └──────────────────┘ └─────────────────────┘ + │ │ │ + └────────┬────────┘ │ + ▼ ▼ + final_dynamic.xclbin dynamic_generated.exe + (load once) (calls generate_txn_sequence + with runtime M, K, N) +``` + +### Compilation Flow + +A single `aiecc` invocation produces both the XCLBIN and the C++ TXN code: + +```bash +aiecc --aie-generate-xclbin --xclbin-name=final_dynamic.xclbin \ + --aie-generate-txn-cpp --txn-cpp-name=generated_gemm_txn.h \ + aie_gemm_dynamic.mlir +``` + +This works because `aiecc` explicitly marks `aie.runtime_sequence` legal for +the module-level SCF-to-CF conversion, which: +- Preserves SCF ops in the runtime sequence for C++ codegen +- Still lowers `aie.core` bodies for normal code generation +- Allows both XCLBIN and TXN generation from the same MLIR with identical buffer addresses + +#### XCLBIN Path (hardware configuration) + +``` +MLIR → objectFifo lowering → buffer allocation → routing + → SCF→CF (core body only) → AIECoreToStandard → LLVM IR + → Peano opt/llc → ELF → CDO → PDI → xclbin +``` + +The XCLBIN contains the core ELF, tile configuration, DMA descriptors, locks, and routing. It does NOT contain the runtime DMA instruction sequence. 
+ +#### TXN C++ Path (runtime instruction generation) + +``` +MLIR (clone) → NPU lowering pipeline + aiex.npu.dma_memcpy_nd (with SSA sizes/strides) + → arith ops computing BD words (d0_size * elemWidth / addrGran, ...) + → npu.write32 for each BD word + → npu.address_patch for buffer addresses + → npu.maskwrite32 for S2MM tokens + → npu.sync for completion + → ConvertAIEXToEmitCPass + → emitc.call_opaque("txn_append_write32", ...) + → emitc.for / emitc.if (from SCF) + → emitc.variable + emitc.assign (for iter_args/results) + → translateToCpp + → generated_gemm_txn.h +``` + +The generated C++ function `generate_txn_sequence(M, K, N)` returns a `std::vector` of NPU instruction words. + +### Runtime Flow + +``` +1. Load XCLBIN (configures AIE array, loads core ELF) +2. Allocate host buffers A[M×K bf16], B[K×N bf16], C[M×N f32] +3. Call generate_txn_sequence(M, K, N) + → Writes RTP values (K/32, M/32 * N/32) + → Programs shim DMA BDs for A, B, C with dynamic strides/offsets + → Issues queue pushes with completion tokens + → Returns instruction word vector +4. Submit instructions to NPU via XRT +5. NPU executes: DMA streams tiles to core, core computes, results DMA back +6. Read result buffer +``` + +### Detailed TXN Walkthrough + +The dynamic TXN path does not generate a new GEMM kernel. It generates the +instruction stream that tells the already-compiled kernel how many 32x32x32 +tiles to process and how to move those tiles through shim DMA. + +#### 1. Fixed compute kernel + +The AIE core kernel in `aie_kernels/aie2p/mm.cc` is fixed at a +**32x32x32 bf16->f32** tile shape. It uses `aie::mmul` and accumulates one +output tile across multiple K-slices. + +The dynamic part is therefore not the math itself. The dynamic part is: + +- how many output tiles exist: `(M / 32) * (N / 32)` +- how many K tiles each output tile accumulates: `K / 32` +- which regions of the host buffers A, B, and C each DMA descriptor covers + +#### 2. 
Core-side runtime control + +Inside `single_core_dynamic.py`, the core reads two RTP values at runtime: + +- `rtp[0] = K_div_k` +- `rtp[1] = total_tiles` + +Those values control the nested loops on the core: + +- outer loop over output tiles +- inner loop over K accumulation steps + +So one fixed ELF can execute many GEMM sizes as long as M, K, and N remain +multiples of 32. + +#### 3. Runtime sequence as a parameterized TXN program + +With `--dynamic-txn`, the runtime sequence itself takes `M`, `K`, and `N` as +SSA values: + +```mlir +aie.runtime_sequence(A, B, C, M, K, N) +``` + +The runtime sequence computes: + +- `M_div_m = M / 32` +- `K_div_k = K / 32` +- `N_div_n = N / 32` +- `tiles = M_div_m * N_div_n` + +It then writes RTP values and emits DMA orchestration. The important thing is +that the orchestration is still written in high-level MLIR using `arith`, `scf`, +and `aiex.npu.dma_memcpy_nd`. + +#### 4. What NPU lowering turns that into + +Each `aiex.npu.dma_memcpy_nd` expands into explicit descriptor programming: + +- BD register writes (`npu.write32`) +- host address patching (`npu.address_patch`) +- queue setup and completion-token configuration (`npu.maskwrite32`) +- completion waits (`npu.sync`) + +This is why the emitted C++ is verbose: by the time EmitC runs, the compiler is +no longer expressing “copy a multidimensional slice”; it is expressing exact NPU +transaction words. + +#### 5. What the generated C++ function does + +The emitted function has the shape: + +```cpp +std::vector generate_txn_sequence(int32_t M, int32_t K, int32_t N) +``` + +Conceptually it does: + +1. Compute `M/32`, `K/32`, `N/32`, and `tiles`. +2. Emit two RTP writes. +3. Loop over tile-row blocks. +4. For each ping-pong half-block: + - compute DMA descriptor fields, + - append BD programming writes, + - append address patches, + - append queue pushes, + - append syncs as needed. +5. Prepend the TXN header and return the word vector. 
+ +The generated code is mechanically lowered from MLIR, so it contains many local +temporaries (`v17`, `v18`, …). These are just SSA values printed as C++ locals. +#### 6. Ping-pong scheduling + +The dynamic sequence processes output rows in blocks: + +- `rows_per_block = 4` +- each ping-pong half handles up to 2 tile rows +- two BD banks are alternated so one half-block can execute while the next is + being prepared + +The schedule looks like: + +- program output C DMA +- program A and B input DMAs for one or two tile rows +- optionally wait for the previous batch before BD reuse +- push DMA queues +- continue to the next half-block + +This is what lets one core stream larger GEMMs while reusing a fixed microkernel. + +#### 7. Why this feature matters + +Without runtime-parameterized TXN generation, you typically need: + +- one XCLBIN +- one fixed instruction stream +- one fixed problem size + +With this feature, you get: + +- one XCLBIN +- one fixed core ELF +- one generated host function that builds a fresh instruction stream for each + runtime `M`, `K`, `N` + +In practice, that means the expensive parts stay fixed: + +- placement +- routing +- core code generation +- XCLBIN creation + +while the cheap part changes at runtime: + +- the transaction stream that sets RTPs and programs DMA descriptors + +That is the core value of the feature: **compile once, vary GEMM shape at +runtime by regenerating only the TXN program**. + +## Design Variants + +| File | Description | +|------|-------------| +| `single_core_dynamic.py` | Low-level dialect with `--dynamic-txn` flag | +| `single_core_dynamic_placed.py` | Placed API variant | +| `single_core_dynamic_iron.py` | IRON high-level API variant | + +## Key Design Decisions + +**Fixed tile size (32x32x32)**: The AIE core microkernel (`mm.cc`) operates on fixed 32x32 tiles. The dynamic part is how many tiles are processed and the DMA pattern for streaming them. 
+ +**RTP for loop bounds**: The core reads K_div_k and total_tiles from an RTP buffer via `memref.load`. The host writes these before issuing DMAs. The DMA start acts as an implicit ordering barrier. + +**Pingpong DMA pattern**: Two sets of BDs (even/odd) overlap compute and data movement. Each half-block processes up to 2 tile rows. The `scf.for` in the runtime_sequence iterates over tile-row blocks, with `scf.if` guards for boundary conditions. + +**SCF preservation**: `aiecc` explicitly keeps `aie.runtime_sequence` legal +during the module-level SCF-to-CF conversion, so SCF stays available for the +EmitC path while `aie.core` regions continue through the normal lowering flow. + +## Constraints + +- M, K, N must be multiples of 32 (the tile size) +- bf16 input, f32 output only (kernel constraint) +- Single core (tile 0,2) — no multi-core parallelism +- NPU2 (Strix Halo) target only + +## Files + +``` +single_core_dynamic/ +├── single_core_dynamic.py # Python design (static + --dynamic-txn) +├── single_core_dynamic_placed.py +├── single_core_dynamic_iron.py +├── test_dynamic.cpp # Test harness (auto-generated TXN path) +├── Makefile +├── tests/ +│ ├── run_strix_makefile.lit +│ ├── run_strix_makefile_placed.lit +│ ├── run_strix_makefile_iron.lit +│ └── run_strix_makefile_generated.lit +└── build/dynsize/ # Generated artifacts + ├── aie_gemm_dynamic.mlir # Dynamic MLIR (with SSA M/K/N) + ├── final_dynamic.xclbin # Hardware configuration + └── generated_gemm_txn.h # Auto-generated C++ TXN function +``` diff --git a/programming_examples/basic/matrix_multiplication/single_core_dynamic/run_makefile_dynamic.lit b/programming_examples/basic/matrix_multiplication/single_core_dynamic/run_makefile_dynamic.lit new file mode 100644 index 00000000000..9a5a62e690d --- /dev/null +++ b/programming_examples/basic/matrix_multiplication/single_core_dynamic/run_makefile_dynamic.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai_npu1, peano +// +// RUN: mkdir -p test_dynamic +// RUN: cd test_dynamic +// RUN: make -f %S/Makefile clean +// RUN: %run_on_npu1% make -f %S/Makefile run_dynamic_generated devicename=npu M=128 K=128 N=128 diff --git a/programming_examples/basic/matrix_multiplication/single_core_dynamic/run_strix_makefile_dynamic.lit b/programming_examples/basic/matrix_multiplication/single_core_dynamic/run_strix_makefile_dynamic.lit new file mode 100644 index 00000000000..5879c9a449d --- /dev/null +++ b/programming_examples/basic/matrix_multiplication/single_core_dynamic/run_strix_makefile_dynamic.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai_npu2, peano +// +// RUN: mkdir -p test_dynamic_stx +// RUN: cd test_dynamic_stx +// RUN: make -f %S/Makefile clean +// RUN: %run_on_npu2% make -f %S/Makefile run_dynamic_generated devicename=npu2 M=128 K=128 N=128 diff --git a/programming_examples/basic/matrix_multiplication/single_core_dynamic/single_core_dynamic.py b/programming_examples/basic/matrix_multiplication/single_core_dynamic/single_core_dynamic.py new file mode 100644 index 00000000000..b6e5ae8b2e3 --- /dev/null +++ b/programming_examples/basic/matrix_multiplication/single_core_dynamic/single_core_dynamic.py @@ -0,0 +1,444 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 AMD Inc. +# +# Dynamic single-core matrix multiplication. +# +# This is a minimal delta over single_core.py. The differences are: +# 1. The runtime_sequence takes M, K, N as SSA i32 inputs, so the same +# compiled XCLBIN handles any (multiple-of-tile) shape at runtime. +# 2. 
The runtime_sequence body uses range_ / if_ (which emit scf.for / +# scf.if at MLIR build time) wherever single_core.py uses Python +# range / if (which elaborate at Python time over Python-int bounds). +# 3. The core body reads its loop trip counts from an RTP buffer that +# the host populates at the start of the runtime_sequence. +import argparse +import numpy as np +import sys + +from aie.extras.context import mlir_mod_ctx +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects import arith, memref +from aie.helpers.dialects.scf import if_ +from aie.extras.dialects.arith import constant +from aie.extras import types as T +import aie.utils.trace as trace_utils +from aie.iron.controlflow import range_ +from aie.iron.dtype import str_to_dtype + +microkernel_mac_dim_map = { + "npu": { + "bf16": (4, 8, 4), + "i8": (4, 8, 8), + "i16": (4, 4, 4), + }, + "npu2": { + "bf16": { + # emulate_bf16_mmul_with_bfp16 + True: (8, 8, 8), + False: (4, 8, 8), + }, + "i8": (8, 8, 8), + "i16": (4, 4, 8), + }, +} + + +def main(): + argparser = argparse.ArgumentParser( + prog="AIE Matrix Multiplication MLIR Design (Single Core, Dynamic)", + description=( + "Emits MLIR for a matrix multiplication design where M, K, N are " + "SSA i32 inputs to the runtime_sequence. The compiled XCLBIN can " + "be re-driven at any (multiple-of-tile) shape at host time." 
+ ), + ) + argparser.add_argument("--dev", type=str, choices=["npu", "npu2"], default="npu") + argparser.add_argument("-M", type=int, default=256) + argparser.add_argument("-K", type=int, default=256) + argparser.add_argument("-N", type=int, default=256) + argparser.add_argument("-m", type=int, default=64) + argparser.add_argument("-k", type=int, default=64) + argparser.add_argument("-n", type=int, default=32) + argparser.add_argument( + "--dtype_in", type=str, choices=["bf16", "i8", "i16"], default="i16" + ) + argparser.add_argument( + "--dtype_out", + type=str, + choices=["bf16", "i8", "i16", "f32", "i32"], + default="i32", + ) + argparser.add_argument("--b-col-maj", type=int, choices=[0, 1], default=0) + argparser.add_argument("--emulate-bf16-mmul-with-bfp16", type=bool, default=False) + argparser.add_argument("--trace_size", type=int, default=0) + args = argparser.parse_args() + my_matmul( + args.dev, + args.M, + args.K, + args.N, + args.m, + args.k, + args.n, + args.dtype_in, + args.dtype_out, + args.b_col_maj, + args.emulate_bf16_mmul_with_bfp16, + args.trace_size, + ) + + +def ceildiv(a, b): + return (a + b - 1) // b + + +def my_matmul( + dev, + M, + K, + N, + m, + k, + n, + dtype_in_str, + dtype_out_str, + b_col_maj, + emulate_bf16_mmul_with_bfp16, + trace_size, +): + + assert M % m == 0 + assert K % k == 0 + assert N % n == 0 + + # r, s, t are the dimensions required by the microkernel MAC instructions. 
+ mac_dims = microkernel_mac_dim_map[dev][dtype_in_str] + if dev == "npu2" and dtype_in_str == "bf16": + r, s, t = mac_dims[emulate_bf16_mmul_with_bfp16] + else: + r, s, t = mac_dims + + assert m % r == 0 + assert k % s == 0 + assert n % t == 0 + + vectorized = True + enable_tracing = True if trace_size > 0 else False + + dtype_in = str_to_dtype(dtype_in_str) + dtype_out = str_to_dtype(dtype_out_str) + + assert np.issubdtype(dtype_in, np.integer) == np.issubdtype( + dtype_out, np.integer + ), f"Input dtype ({dtype_in}) and output dtype ({dtype_out}) must either both be integral or both be float" + assert ( + np.dtype(dtype_out).itemsize >= np.dtype(dtype_in).itemsize + ), f"Output dtype ({dtype_out}) must be equal or larger to input dtype ({dtype_in})" + + A_sz = M * K + B_sz = K * N + C_sz = M * N + + with mlir_mod_ctx() as ctx: + + if dev == "npu": + dev_ty = AIEDevice.npu1_1col + else: + dev_ty = AIEDevice.npu2 + + @device(dev_ty) + def device_body(): + a_ty = np.ndarray[(m, k), np.dtype[dtype_in]] + b_ty = np.ndarray[(k, n), np.dtype[dtype_in]] + c_ty = np.ndarray[(m, n), np.dtype[dtype_out]] + + # AIE Core Function declarations + func_type = "" if vectorized else "scalar_" + zero = external_func( + f"zero_{func_type}{dtype_out_str}", + inputs=[c_ty], + link_with=f"mm_{m}x{k}x{n}.o", + ) + matmul_func_name = f"matmul_{func_type}{dtype_in_str}_{dtype_out_str}" + matmul = external_func( + matmul_func_name, + inputs=[a_ty, b_ty, c_ty], + link_with=f"mm_{m}x{k}x{n}.o", + ) + + # Tile declarations + shim_tile = tile(0, 0) + mem_tile = tile(0, 1) + compute_tile2_col, compute_tile2_row = 0, 2 + compute_tile2 = tile(compute_tile2_col, compute_tile2_row) + + # AIE-array data movement with object fifos + # Input A + inA = object_fifo("inA", shim_tile, mem_tile, 2, a_ty) + memA = object_fifo( + "memA", + mem_tile, + compute_tile2, + 2, + a_ty, + ( + [ + (m // r, r * k), + (k // s, s), + (r, k), + (s, 1), + ] + if vectorized + else [] + ), + ) + object_fifo_link(inA, memA) 
+ + # Input B + inB = object_fifo("inB", shim_tile, mem_tile, 2, b_ty) + + B_transformations = [] + if vectorized: + if not b_col_maj: + B_transformations = [ + (k // s, s * n), + (n // t, t), + (s, n), + (t, 1), + ] + else: + B_transformations = [ + (n // t, t * k), + (k // s, s), + (t, k), + (s, 1), + ] + + memB = object_fifo( + "memB", + mem_tile, + compute_tile2, + 2, + b_ty, + B_transformations, + ) + + object_fifo_link(inB, memB) + + # Output C + memC = object_fifo("memC", compute_tile2, mem_tile, 2, c_ty) + outC = object_fifo( + "outC", + mem_tile, + shim_tile, + 2, + c_ty, + ( + [ + (m // r, r * n), + (r, t), + (n // t, r * t), + (t, 1), + ] + if vectorized + else [] + ), + ) + object_fifo_link(memC, outC) + + # RTP buffer shared with the host. Layout: + # rtp[0] = K_div_k (inner-K loop trip count) + # rtp[1] = tiles (outer tile loop trip count) + rtp_buf = buffer(compute_tile2, T.memref(2, T.i32()), name="rtp") + + # Set up a packet-switched flow from core to shim for tracing information + tiles_to_trace = [compute_tile2] + if trace_size > 0: + trace_utils.configure_trace( + tiles_to_trace, + coretile_events=[ + # captures input A (PORT_RUNNING_0, DMA channel 0, master for inputs) + trace_utils.events.PortEvent( + trace_utils.events.CoreEvent.PORT_RUNNING_0, + trace_utils.events.WireBundle.DMA, + 0, + True, + ), + # captures input B (PORT_RUNNING_1, DMA channel 1, master for inputs) + trace_utils.events.PortEvent( + trace_utils.events.CoreEvent.PORT_RUNNING_1, + trace_utils.events.WireBundle.DMA, + 1, + True, + ), + # captures output C (PORT_RUNNING_2, DMA channel 0, slave for outputs) + trace_utils.events.PortEvent( + trace_utils.events.CoreEvent.PORT_RUNNING_2, + trace_utils.events.WireBundle.DMA, + 0, + False, + ), + trace_utils.events.CoreEvent.INSTR_EVENT_0, + trace_utils.events.CoreEvent.INSTR_EVENT_1, + trace_utils.events.CoreEvent.MEMORY_STALL, + trace_utils.events.CoreEvent.LOCK_STALL, + trace_utils.events.CoreEvent.INSTR_VECTOR, + ], + ) + + # 
The stack size choice is an important choice! + # The Peano compiler uses a stack size in this kernel greater than the default one + # (default is 0x400, chess' stack size is smaller). + # Exceeding the stack size leads to wrong results from the kernel, but no error is triggered. + # Stack usage can be checked as explained here: + # https://github.com/Xilinx/llvm-aie/issues/487#issuecomment-2969438585 + @core(compute_tile2, stack_size=0xD00, dynamic_objfifo_lowering=True) + def core_body(): + # Loop trip counts come from RTP rather than Python ints, so + # the compiled core handles any (M, K, N) the host writes. + c0_idx = constant(0, index=True) + c1_idx = constant(1, index=True) + for _ in range_(0xFFFFFFFF): + tiles = memref.load(rtp_buf, [c1_idx]) + K_div_k = memref.load(rtp_buf, [c0_idx]) + for _ in range_(tiles): + + elem_out = memC.acquire(ObjectFifoPort.Produce, 1) + zero(elem_out) + + for _ in range_(K_div_k): + elem_in_a = memA.acquire(ObjectFifoPort.Consume, 1) + elem_in_b = memB.acquire(ObjectFifoPort.Consume, 1) + matmul(elem_in_a, elem_in_b, elem_out) + memA.release(ObjectFifoPort.Consume, 1) + memB.release(ObjectFifoPort.Consume, 1) + + memC.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + + @runtime_sequence( + np.ndarray[(A_sz,), np.dtype[dtype_in]], + np.ndarray[(B_sz,), np.dtype[dtype_in]], + np.ndarray[(C_sz,), np.dtype[dtype_out]], + T.i32(), # M + T.i32(), # K + T.i32(), # N + ) + def sequence(A, B, C, M, K, N): + + if enable_tracing: + trace_utils.start_trace(trace_size=trace_size) + + # Tile counts: same Python expressions as in single_core.py. + # Here M, K, N are SSA i32, so eudsl operator overloads emit + # arith.floordivsi / arith.muli automatically. + M_div_m = M // m + K_div_k = K // k + N_div_n = N // n + tiles = M_div_m * N_div_n + + # Push trip counts to the core via the RTP buffer. 
+ npu_rtp_write("rtp", 0, K_div_k) + npu_rtp_write("rtp", 1, tiles) + + # only do 4 tile rows at a time before synchronizing, so we can reuse BDs + rows_per_block = 4 + # The body below mirrors single_core.py's runtime_sequence. + # The only changes from the static version are: + # * range -> range_ (emits scf.for over an SSA bound) + # * if num_tile_rows <= 0: break + # -> with if_(num_tile_rows > 0, hasElse=False): + # * min(...) -> arith.minsi(...) (Python min cannot + # bool() an SSA i1, so we call the op directly) + # * the inner `for tile_row in range(num_tile_rows)` is + # unrolled to range(rows_per_block // 2) with an if_ + # guard, since bd_id must be a Python-time integer. + # * dma_wait(outC) -> npu_sync(...) (dma_wait inside + # scf.if has terminator-conversion issues today). + for tile_row_block in range_(ceildiv(M_div_m, rows_per_block)): + # we only sync on half the BDs before reusing them, so the other half can concurrently keep running + # that's what this loop is for + for pingpong in [0, 1]: + C_row_offset = ( + tile_row_block * rows_per_block * m * N + + pingpong * rows_per_block // 2 * m * N + ) + row_base = ( + tile_row_block * rows_per_block + + pingpong * rows_per_block // 2 + ) + bd_id_base = 8 * pingpong + num_tile_rows = arith.minsi( + constant(rows_per_block // 2, T.i32()), + M_div_m - row_base, + ) + with if_(num_tile_rows > 0, hasElse=False): + npu_dma_memcpy_nd( + metadata=outC, + bd_id=bd_id_base, + mem=C, + offsets=[0, 0, 0, C_row_offset], + sizes=[num_tile_rows, N_div_n, m, n], + strides=[m * N, n, N, 1], + ) + for tile_row in range(rows_per_block // 2): + A_row_offset = (row_base + tile_row) * m * K + + def emit_ab(): + npu_dma_memcpy_nd( + metadata=inA, + bd_id=bd_id_base + 2 * tile_row + 1, + mem=A, + offsets=[0, 0, 0, A_row_offset], + sizes=[N_div_n, K_div_k, m, k], + strides=[0, k, K, 1], + ) + + if not b_col_maj: + B_sizes = [N_div_n, K_div_k, k, n] + B_strides = [n, k * N, N, 1] + else: + B_sizes = [N_div_n, K_div_k, n, k] + 
B_strides = [n * K, k, K, 1] + + npu_dma_memcpy_nd( + metadata=inB, + bd_id=bd_id_base + 2 * tile_row + 2, + mem=B, + sizes=B_sizes, + strides=B_strides, + ) + + if tile_row == 0: + emit_ab() + else: + with if_(num_tile_rows > tile_row, hasElse=False): + emit_ab() + + # Sync condition mirrors single_core.py: + # tile_row_block > 0 OR pingpong > 0 + # pingpong > 0 is always true at Python time, so + # only the pingpong == 0 branch needs an SSA guard. + if pingpong > 0: + npu_sync(column=0, row=0, direction=0, channel=0) + else: + with if_(tile_row_block > 0, hasElse=False): + npu_sync(column=0, row=0, direction=0, channel=0) + + # Final sync. + npu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +if __name__ == "__main__": + main() +else: + print("Not meant to be imported") + sys.exit(1) diff --git a/programming_examples/basic/matrix_multiplication/single_core_dynamic/single_core_dynamic_iron.py b/programming_examples/basic/matrix_multiplication/single_core_dynamic/single_core_dynamic_iron.py new file mode 100644 index 00000000000..90ddaa346aa --- /dev/null +++ b/programming_examples/basic/matrix_multiplication/single_core_dynamic/single_core_dynamic_iron.py @@ -0,0 +1,251 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 AMD Inc. +# +# IRON-level dynamic single-core GEMM (bf16->f32, 32x32x32 tiles). +# +# Uses IRON high-level abstractions (Worker, ObjectFifo, Kernel, Runtime, +# Program) with RTP-based dynamic loop bounds for compile-once-run-any-size. +# Fixed tile sizes (32x32x32) are compiled once; the host sets loop bounds +# (K iterations, tile count) via RTP before each run. 
+ +import argparse +import numpy as np + +from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker, Buffer, str_to_dtype +from aie.iron.device import NPU2 +from aie.iron.controlflow import range_ +from aie.helpers.taplib import TensorTiler2D + +from aie.dialects.scf import WhileOp, condition, yield_ +from aie.dialects.arith import CmpIOp, CmpIPredicate, AddIOp, IndexCastOp +from aie.dialects.memref import LoadOp +from aie.extras.dialects.arith import constant +from aie.ir import IndexType, InsertionPoint +from aie.extras import types as T + + +def main(): + argparser = argparse.ArgumentParser( + prog="AIE Dynamic Matrix Multiplication MLIR Design (Single Core, IRON)", + description="Emits MLIR code for a dynamic matrix multiplication design using IRON APIs", + ) + argparser.add_argument("--dev", type=str, choices=["npu2"], default="npu2") + argparser.add_argument("-M", type=int, default=128) + argparser.add_argument("-K", type=int, default=128) + argparser.add_argument("-N", type=int, default=128) + argparser.add_argument("--dtype_in", type=str, choices=["bf16"], default="bf16") + argparser.add_argument("--dtype_out", type=str, choices=["f32"], default="f32") + argparser.add_argument("--trace_size", type=int, default=0) + args = argparser.parse_args() + module = my_matmul( + args.dev, args.M, args.K, args.N, args.dtype_in, args.dtype_out, args.trace_size + ) + print(module) + + +def ceildiv(a, b): + return (a + b - 1) // b + + +def my_matmul(dev, M, K, N, dtype_in_str, dtype_out_str, trace_size): + # Fixed tile sizes for dynamic design + m, k, n = 32, 32, 32 + + assert M % m == 0 + assert K % k == 0 + assert N % n == 0 + + # NPU2 bf16 microkernel dimensions: r=4, s=8, t=8 + r, s, t = 4, 8, 8 + + dtype_in = str_to_dtype(dtype_in_str) + dtype_out = str_to_dtype(dtype_out_str) + + M_div_m = M // m + K_div_k = K // k + N_div_n = N // n + tiles = M_div_m * N_div_n + + # Define tensor types + A_ty = np.ndarray[(M * K,), np.dtype[dtype_in]] + B_ty = np.ndarray[(K 
* N,), np.dtype[dtype_in]] + C_ty = np.ndarray[(M * N,), np.dtype[dtype_out]] + a_ty = np.ndarray[(m, k), np.dtype[dtype_in]] + b_ty = np.ndarray[(k, n), np.dtype[dtype_in]] + c_ty = np.ndarray[(m, n), np.dtype[dtype_out]] + + # AIE Core Function declarations + zero_kernel = Kernel(f"zero_{dtype_out_str}", f"mm_{m}x{k}x{n}.o", [c_ty]) + matmul_kernel = Kernel( + f"matmul_{dtype_in_str}_{dtype_out_str}", + f"mm_{m}x{k}x{n}.o", + [a_ty, b_ty, c_ty], + ) + + # AIE-array data movement with object fifos + # Input A + inA = ObjectFifo(a_ty, name="inA") + a_dims = [(m // r, r * k), (k // s, s), (r, k), (s, 1)] + memA = inA.cons().forward(name="memA", dims_to_stream=a_dims) + + # Input B + inB = ObjectFifo(b_ty, name="inB") + b_dims = [(k // s, s * n), (n // t, t), (s, n), (t, 1)] + memB = inB.cons().forward(name="memB", dims_to_stream=b_dims) + + # Output C + memC = ObjectFifo(c_ty, name="memC") + c_dims = [(m // r, r * n), (r, t), (n // t, r * t), (t, 1)] + outC = memC.cons().forward(name="outC", dims_to_stream=c_dims) + + # RTP buffer: [0] = K_div_k, [1] = tiles (M_div_m * N_div_n) + rtp_buf = Buffer( + type=np.ndarray[(16,), np.dtype[np.int32]], + name="rtp", + use_write_rtp=True, + ) + + # Core function with dynamic loop bounds via RTP + def core_fn(of_a, of_b, of_c, zero, matmul, rtp): + idx_ty = IndexType.get() + c0_idx = constant(0, index=True) + c1_idx = constant(1, index=True) + + # Read tile count from RTP[1] + tiles_i32 = LoadOp(rtp.op, [c1_idx]).result + tiles_idx = IndexCastOp(idx_ty, tiles_i32).result + + # Tile loop (scf.while for dynamic bounds) + tile_while = WhileOp([idx_ty], [c0_idx]) + + # "before" region: check condition + before_block = tile_while.before.blocks.append(idx_ty) + with InsertionPoint(before_block): + tile_iter = before_block.arguments[0] + tile_cond = CmpIOp(CmpIPredicate.slt, tile_iter, tiles_idx).result + condition(tile_cond, [tile_iter]) + + # "after" region: loop body + after_block = tile_while.after.blocks.append(idx_ty) + with 
InsertionPoint(after_block): + tile_iter = after_block.arguments[0] + + elem_out = of_c.acquire(1) + zero(elem_out) + + # Read K_div_k from RTP[0] + k_iters_i32 = LoadOp(rtp.op, [c0_idx]).result + k_iters_idx = IndexCastOp(idx_ty, k_iters_i32).result + + # K accumulation loop (scf.while) + k_while = WhileOp([idx_ty], [c0_idx]) + + k_before = k_while.before.blocks.append(idx_ty) + with InsertionPoint(k_before): + k_iter = k_before.arguments[0] + k_cond = CmpIOp(CmpIPredicate.slt, k_iter, k_iters_idx).result + condition(k_cond, [k_iter]) + + k_after = k_while.after.blocks.append(idx_ty) + with InsertionPoint(k_after): + k_iter = k_after.arguments[0] + + elem_in_a = of_a.acquire(1) + elem_in_b = of_b.acquire(1) + matmul(elem_in_a, elem_in_b, elem_out) + of_a.release(1) + of_b.release(1) + + k_next = AddIOp(k_iter, c1_idx).result + yield_([k_next]) + + of_c.release(1) + + tile_next = AddIOp(tile_iter, c1_idx).result + yield_([tile_next]) + + # Create worker with dynamic objfifo lowering + worker = Worker( + core_fn, + [ + memA.cons(), + memB.cons(), + memC.prod(), + zero_kernel, + matmul_kernel, + rtp_buf, + ], + stack_size=0xD00, + dynamic_objfifo_lowering=True, + ) + + # only do 4 tile rows at a time before synchronizing, so we can reuse BDs + rows_per_block = 4 + + # Define tensor access patterns for inputs/outputs + A_tiles = TensorTiler2D.group_tiler( + (M, K), (m, k), (1, K_div_k), pattern_repeat=N_div_n, prune_step=False + ) + b_tap = TensorTiler2D.group_tiler( + (K, N), + (k, n), + (K_div_k, N_div_n), + tile_group_col_major=True, + prune_step=False, + )[0] + C_tiles = TensorTiler2D.group_tiler( + (M, N), (m, n), (rows_per_block // 2, N_div_n), prune_step=False + ) + c_index = 0 + + # Runtime operations to move data to/from the AIE-array + rt = Runtime() + with rt.sequence(A_ty, B_ty, C_ty) as (A, B, C): + rt.start(worker) + + # Write RTP values for the static compilation size + rt.write_rtp(rtp_buf, 0, K_div_k) + rt.write_rtp(rtp_buf, 1, tiles) + + tgs = [] + 
for tile_row_block in range(ceildiv(M_div_m, rows_per_block)): + for pingpong in [0, 1]: + row_base = ( + tile_row_block * rows_per_block + pingpong * rows_per_block // 2 + ) + num_tile_rows = min([rows_per_block // 2, M_div_m - row_base]) + if num_tile_rows <= 0: + break + tgs.append(rt.task_group()) + for tile_row in range(num_tile_rows): + tile_offset = (row_base + tile_row) % len(A_tiles) + rt.fill(inA.prod(), A, tap=A_tiles[tile_offset], task_group=tgs[-1]) + rt.fill(inB.prod(), B, tap=b_tap, task_group=tgs[-1]) + + rt.drain( + outC.cons(), C, tap=C_tiles[c_index], task_group=tgs[-1], wait=True + ) + c_index += 1 + + if tile_row_block > 0 or (tile_row_block == 0 and pingpong > 0): + rt.finish_task_group(tgs[-2]) + del tgs[-2] + + rt.finish_task_group(tgs[-1]) + del tgs[-1] + + # Create the program from the device type and runtime + dev_ty = NPU2() + my_program = Program(dev_ty, rt) + + # Place components and generate MLIR module + module = my_program.resolve_program() + return module + + +if __name__ == "__main__": + main() diff --git a/programming_examples/basic/matrix_multiplication/single_core_dynamic/single_core_dynamic_placed.py b/programming_examples/basic/matrix_multiplication/single_core_dynamic/single_core_dynamic_placed.py new file mode 100644 index 00000000000..f2f77d71244 --- /dev/null +++ b/programming_examples/basic/matrix_multiplication/single_core_dynamic/single_core_dynamic_placed.py @@ -0,0 +1,378 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 AMD Inc. +# +# Dynamic single-core GEMM design with placed runtime sequence (bf16->f32, 32x32x32 tiles). 
+# +# Uses the dynamic core body (RTP reads + scf.while loops) for compile-once-run-any-size, +# combined with the placed runtime sequence pattern (shim_dma_single_bd_task / dma_start_task +# / dma_await_task / dma_free_task) instead of npu_dma_memcpy_nd. + +import argparse +import numpy as np +import sys + +from aie.extras.context import mlir_mod_ctx +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import WhileOp, condition, yield_ +from aie.dialects.arith import ( + constant as arith_constant, + CmpIOp, + CmpIPredicate, + AddIOp, + IndexCastOp, +) +from aie.dialects.memref import LoadOp +from aie.extras.dialects.arith import constant +import aie.utils.trace as trace_utils +from aie.helpers.taplib import TensorTiler2D +from aie.iron.controlflow import range_ +from aie.iron.dtype import str_to_dtype +from aie.ir import IndexType, IntegerType, InsertionPoint +from aie.extras import types as T + + +def make_port_event(code, channel: int, master: bool = True): + try: + return trace_utils.events.PortEvent( + code, WireBundle.DMA, channel, master=master + ) + except TypeError: + return trace_utils.events.PortEvent(code, channel, master=master) + + +def main(): + argparser = argparse.ArgumentParser( + prog="AIE Dynamic Matrix Multiplication MLIR Design (Single Core, Placed)", + description="Emits MLIR code for a dynamic matrix multiplication design with placed runtime sequence", + ) + argparser.add_argument("--dev", type=str, choices=["npu2"], default="npu2") + argparser.add_argument("-M", type=int, default=128) + argparser.add_argument("-K", type=int, default=128) + argparser.add_argument("-N", type=int, default=128) + argparser.add_argument("--dtype_in", type=str, choices=["bf16"], default="bf16") + argparser.add_argument("--dtype_out", type=str, choices=["f32"], default="f32") + argparser.add_argument("--trace_size", type=int, default=0) + args = argparser.parse_args() + my_matmul( + args.dev, + args.M, + args.K, + args.N, + 
args.dtype_in, + args.dtype_out, + args.trace_size, + ) + + +def ceildiv(a, b): + return (a + b - 1) // b + + +def my_matmul(dev, M, K, N, dtype_in_str, dtype_out_str, trace_size): + # Fixed tile sizes for dynamic design + m, k, n = 32, 32, 32 + + assert M % m == 0 + assert K % k == 0 + assert N % n == 0 + + # NPU2 bf16 microkernel dimensions: r=4, s=8, t=8 + r, s, t = 4, 8, 8 + + dtype_in = str_to_dtype(dtype_in_str) + dtype_out = str_to_dtype(dtype_out_str) + + A_sz = M * K + B_sz = K * N + C_sz = M * N + + M_div_m = M // m + K_div_k = K // k + N_div_n = N // n + tiles = M_div_m * N_div_n + + enable_tracing = trace_size > 0 + + with mlir_mod_ctx() as ctx: + dev_ty = AIEDevice.npu2 + + @device(dev_ty) + def device_body(): + a_ty = np.ndarray[(m, k), np.dtype[dtype_in]] + b_ty = np.ndarray[(k, n), np.dtype[dtype_in]] + c_ty = np.ndarray[(m, n), np.dtype[dtype_out]] + + # AIE Core Function declarations + zero = external_func(f"zero_{dtype_out_str}", inputs=[c_ty]) + matmul = external_func( + f"matmul_{dtype_in_str}_{dtype_out_str}", + inputs=[a_ty, b_ty, c_ty], + ) + + # Tile declarations + shim_tile = tile(0, 0) + mem_tile = tile(0, 1) + compute_tile2_col, compute_tile2_row = 0, 2 + compute_tile2 = tile(compute_tile2_col, compute_tile2_row) + + # AIE-array data movement with object fifos + # Input A + inA = object_fifo("inA", shim_tile, mem_tile, 2, a_ty) + memA = object_fifo( + "memA", + mem_tile, + compute_tile2, + 2, + a_ty, + [ + (m // r, r * k), + (k // s, s), + (r, k), + (s, 1), + ], + ) + object_fifo_link(inA, memA) + + # Input B + inB = object_fifo("inB", shim_tile, mem_tile, 2, b_ty) + memB = object_fifo( + "memB", + mem_tile, + compute_tile2, + 2, + b_ty, + [ + (k // s, s * n), + (n // t, t), + (s, n), + (t, 1), + ], + ) + object_fifo_link(inB, memB) + + # Output C + memC = object_fifo("memC", compute_tile2, mem_tile, 2, c_ty) + outC = object_fifo( + "outC", + mem_tile, + shim_tile, + 2, + c_ty, + [ + (m // r, r * n), + (r, t), + (n // t, r * t), + (t, 
1), + ], + ) + object_fifo_link(memC, outC) + + # RTP buffer: [0] = K_div_k, [1] = tiles (M_div_m * N_div_n) + rtp_buf = buffer( + compute_tile2, + T.memref(16, T.i32()), + name="rtp", + address=0x600, + ) + + # Set up tracing + tiles_to_trace = [compute_tile2] + if enable_tracing: + trace_utils.configure_packet_tracing_flow(tiles_to_trace, shim_tile) + + # Core body with dynamic loop bounds via RTP + @core( + compute_tile2, + f"mm_{m}x{k}x{n}.o", + stack_size=0xD00, + dynamic_objfifo_lowering=True, + ) + def core_body(): + i32_ty = IntegerType.get_signless(32) + idx_ty = IndexType.get() + + c0 = arith_constant(i32_ty, 0) + c1 = arith_constant(i32_ty, 1) + c0_idx = constant(0, index=True) + c1_idx = constant(1, index=True) + cmax_idx = constant(0xFFFFFFFF, index=True) + + # Infinite outer loop + for _ in range_(c0_idx, cmax_idx, c1_idx): + + # Read tiles count from RTP[1] + tiles_i32 = LoadOp(rtp_buf, [c1_idx]).result + tiles_idx = IndexCastOp(idx_ty, tiles_i32).result + + # Tile loop (scf.while for dynamic bounds) + tile_while = WhileOp([idx_ty], [c0_idx]) + + # "before" region: check condition + before_block = tile_while.before.blocks.append(idx_ty) + with InsertionPoint(before_block): + tile_iter = before_block.arguments[0] + tile_cond = CmpIOp( + CmpIPredicate.slt, tile_iter, tiles_idx + ).result + condition(tile_cond, [tile_iter]) + + # "after" region: loop body + after_block = tile_while.after.blocks.append(idx_ty) + with InsertionPoint(after_block): + tile_iter = after_block.arguments[0] + + elem_out = memC.acquire(ObjectFifoPort.Produce, 1) + zero(elem_out) + + # Read K_div_k from RTP[0] + k_iters_i32 = LoadOp(rtp_buf, [c0_idx]).result + k_iters_idx = IndexCastOp(idx_ty, k_iters_i32).result + + # K accumulation loop (scf.while) + k_while = WhileOp([idx_ty], [c0_idx]) + + k_before = k_while.before.blocks.append(idx_ty) + with InsertionPoint(k_before): + k_iter = k_before.arguments[0] + k_cond = CmpIOp( + CmpIPredicate.slt, k_iter, k_iters_idx + ).result + 
condition(k_cond, [k_iter]) + + k_after = k_while.after.blocks.append(idx_ty) + with InsertionPoint(k_after): + k_iter = k_after.arguments[0] + + elem_in_a = memA.acquire(ObjectFifoPort.Consume, 1) + elem_in_b = memB.acquire(ObjectFifoPort.Consume, 1) + matmul(elem_in_a, elem_in_b, elem_out) + memA.release(ObjectFifoPort.Consume, 1) + memB.release(ObjectFifoPort.Consume, 1) + + k_next = AddIOp(k_iter, c1_idx).result + yield_([k_next]) + + memC.release(ObjectFifoPort.Produce, 1) + + tile_next = AddIOp(tile_iter, c1_idx).result + yield_([tile_next]) + + # Placed runtime_sequence using shim_dma_single_bd_task + @runtime_sequence( + np.ndarray[(A_sz,), np.dtype[dtype_in]], + np.ndarray[(B_sz,), np.dtype[dtype_in]], + np.ndarray[(C_sz,), np.dtype[dtype_out]], + ) + def sequence(A, B, C): + if enable_tracing: + trace_utils.configure_packet_tracing_aie2( + tiles_to_trace=tiles_to_trace, + shim=shim_tile, + trace_size=trace_size, + coretile_events=[ + make_port_event( + trace_utils.events.CoreEvent.PORT_RUNNING_0, + 0, + master=True, + ), + make_port_event( + trace_utils.events.CoreEvent.PORT_RUNNING_1, + 1, + master=True, + ), + make_port_event( + trace_utils.events.CoreEvent.PORT_RUNNING_2, + 0, + master=False, + ), + trace_utils.events.CoreEvent.INSTR_EVENT_0, + trace_utils.events.CoreEvent.INSTR_EVENT_1, + trace_utils.events.CoreEvent.MEMORY_STALL, + trace_utils.events.CoreEvent.LOCK_STALL, + trace_utils.events.CoreEvent.INSTR_VECTOR, + ], + ) + + # Write RTP values for the static compilation size + npu_rtp_write("rtp", 0, K_div_k) + npu_rtp_write("rtp", 1, tiles) + + # Use 2 tile rows per block (placed style, prevents BD exhaustion) + rows_per_block = 2 + + # Define tensor access patterns using TensorTiler2D + A_taps = TensorTiler2D.group_tiler( + (M, K), (m, k), (1, K_div_k), pattern_repeat=N_div_n + ) + b_tap = TensorTiler2D.group_tiler( + (K, N), + (k, n), + (K_div_k, N_div_n), + tile_group_col_major=True, + )[0] + C_taps = TensorTiler2D.group_tiler( + (M, N), (m, 
n), (rows_per_block // 2, N_div_n) + ) + c_index = 0 + + a_tasks = [] + b_tasks = [] + c_tasks = [] + + for tile_row_block in range(ceildiv(M_div_m, rows_per_block)): + for pingpong in [0, 1]: + row_base = ( + tile_row_block * rows_per_block + + pingpong * rows_per_block // 2 + ) + num_tile_rows = min([rows_per_block // 2, M_div_m - row_base]) + if num_tile_rows <= 0: + break + + # -- C -- + c_task = shim_dma_single_bd_task( + outC, + C, + tap=C_taps[c_index], + issue_token=True, + ) + c_index += 1 + dma_start_task(c_task) + c_tasks.append(c_task) + + for tile_row in range(num_tile_rows): + # -- A -- + tile_offset = (row_base + tile_row) % len(A_taps) + a_task = shim_dma_single_bd_task( + inA, A, tap=A_taps[tile_offset] + ) + dma_start_task(a_task) + a_tasks.append(a_task) + + # -- B -- + b_task = shim_dma_single_bd_task(inB, B, tap=b_tap) + dma_start_task(b_task) + b_tasks.append(b_task) + + if tile_row_block > 0 or (tile_row_block == 0 and pingpong > 0): + dma_await_task(c_tasks[-2]) + dma_free_task(a_tasks[-2]) + dma_free_task(b_tasks[-2]) + + dma_await_task(c_tasks[-1]) + + trace_utils.gen_trace_done_aie2(shim_tile) + + print(ctx.module) + + +if __name__ == "__main__": + main() +else: + print("Not meant to be imported") + sys.exit(1) diff --git a/programming_examples/basic/matrix_multiplication/single_core_dynamic/test_dynamic.cpp b/programming_examples/basic/matrix_multiplication/single_core_dynamic/test_dynamic.cpp new file mode 100644 index 00000000000..f3113ecd2af --- /dev/null +++ b/programming_examples/basic/matrix_multiplication/single_core_dynamic/test_dynamic.cpp @@ -0,0 +1,221 @@ +//===- test_dynamic.cpp - Dynamic GEMM test harness -------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// +// +// Test harness for dynamic single-core GEMM. Generates TXN instructions at +// runtime for the specified M/K/N, using the same XCLBIN for all sizes. +// +//===----------------------------------------------------------------------===// + +#include "generated_gemm_txn.h" + +#include "cxxopts.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "common.h" + +using A_DATATYPE = std::bfloat16_t; +using B_DATATYPE = std::bfloat16_t; +using C_DATATYPE = float; +using ACC_DATATYPE = float; + +int main(int argc, const char *argv[]) { + cxxopts::Options options("Dynamic GEMM Test"); + options.add_options()("help,h", "produce help message")( + "xclbin,x", "the input xclbin path", cxxopts::value())( + "kernel,k", "the kernel name in the XCLBIN", + cxxopts::value()->default_value("MLIR_AIE"))( + "verbosity,v", "the verbosity of the output", + cxxopts::value()->default_value("1"))( + "verify", "whether to verify the AIE computed output", + cxxopts::value()->default_value("true"))( + "rows,M", "Matrix rows M", cxxopts::value()->default_value("64"))( + "inner,K", "Matrix inner dimension K", + cxxopts::value()->default_value("64"))( + "columns,N", "Matrix columns N", + cxxopts::value()->default_value("64"))( + "iters", "number of iterations", + cxxopts::value()->default_value("1"))( + "warmup", "number of warmup iterations", + cxxopts::value()->default_value("0")); + + cxxopts::ParseResult vm; + try { + vm = options.parse(argc, argv); + } catch (const cxxopts::exceptions::parsing &e) { + std::cerr << e.what() << "\n\n" << options.help() << "\n"; + return 1; + } + if (vm.count("help")) { + std::cout << options.help() << "\n"; + return 0; + } + if (!vm.count("xclbin")) { + std::cerr << "Error: --xclbin is required\n" << options.help() << "\n"; + return 1; + 
} + int verbosity = vm["verbosity"].as(); + bool do_verify = vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup = vm["warmup"].as(); + int M = vm["M"].as(); + int K = vm["K"].as(); + int N = vm["N"].as(); + + if (M <= 0 || K <= 0 || N <= 0) { + std::cerr << "Error: M, K, N must be positive\n"; + return 1; + } + if (M % 32 != 0 || K % 32 != 0 || N % 32 != 0) { + std::cerr << "Error: M, K, N must be multiples of 32\n"; + return 1; + } + + srand(1726250518); // fixed seed for reproducibility + + if (verbosity >= 1) + std::cout << "Dynamic GEMM: " << M << "x" << K << "x" << N << std::endl; + + // Auto-generated TXN from aie-translate --aie-generate-txn-cpp + std::vector instr_v = + generate_txn_sequence(static_cast(M), static_cast(K), + static_cast(N)); + if (verbosity >= 1) + std::cout << "Generated " << instr_v.size() + << " instruction words (auto-generated)\n"; + + int A_VOLUME = M * K; + int B_VOLUME = K * N; + int C_VOLUME = M * N; + size_t A_SIZE = A_VOLUME * sizeof(A_DATATYPE); + size_t B_SIZE = B_VOLUME * sizeof(B_DATATYPE); + size_t C_SIZE = C_VOLUME * sizeof(C_DATATYPE); + + // XRT setup + auto device = xrt::device(0); + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + std::string node = vm["kernel"].as(); + + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [&node](xrt::xclbin::kernel &xk) { + return xk.get_name().rfind(node, 0) == 0; + }); + + device.register_xclbin(xclbin); + xrt::hw_context context(device, xclbin.get_uuid()); + auto kernel = xrt::kernel(context, xkernel.get_name()); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_a = + xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_b = + xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = + xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + auto bo_tmp = 
xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(6)); + auto bo_trace = + xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(7)); + + // Initialize buffers + A_DATATYPE *bufA = bo_a.map(); + std::vector AVec(A_VOLUME); + for (int i = 0; i < A_VOLUME; i++) + AVec[i] = matmul_common::get_random(); + memcpy(bufA, AVec.data(), A_SIZE); + + B_DATATYPE *bufB = bo_b.map(); + std::vector BVec(B_VOLUME); + for (int i = 0; i < B_VOLUME; i++) + BVec[i] = matmul_common::get_random() * i; + memcpy(bufB, BVec.data(), B_SIZE); + + char *bufOut = bo_out.map(); + memset(bufOut, 0, C_SIZE); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(uint32_t)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned num_iter = n_iterations + n_warmup; + float npu_time_total = 0, npu_time_min = std::numeric_limits::max(), + npu_time_max = 0; + int errors = 0; + float macs = 2.0f * M * K * N; + + float abs_tol = matmul_common::get_abs_tol(); + float rel_tol = matmul_common::get_rel_tol(); + + for (unsigned iter = 0; iter < num_iter; iter++) { + if (verbosity >= 1) + std::cout << "Running kernel (iteration " << iter << ").\n"; + + auto start = std::chrono::high_resolution_clock::now(); + auto run = kernel(3, bo_instr, instr_v.size(), bo_a, bo_b, bo_out, bo_tmp, + bo_trace); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Status: " << r << "\n"; + return 1; + } + auto stop = std::chrono::high_resolution_clock::now(); + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + if (iter < (unsigned)n_warmup) + continue; + + if (do_verify) { + std::vector CVec(C_VOLUME); + memcpy(CVec.data(), bufOut, C_SIZE); + if (verbosity >= 1) + std::cout << "Verifying against reference matmul ..." 
<< std::endl; + errors = matmul_common::verify( + M, N, K, AVec, BVec, CVec, verbosity, abs_tol, rel_tol); + } + + float npu_time = + std::chrono::duration_cast(stop - start) + .count(); + npu_time_total += npu_time; + npu_time_min = std::min(npu_time, npu_time_min); + npu_time_max = std::max(npu_time, npu_time_max); + } + + std::cout << "\nAvg NPU matmul time: " << npu_time_total / n_iterations + << "us.\n"; + std::cout << "Avg NPU gflops: " + << macs / (1000 * npu_time_total / n_iterations) << "\n"; + std::cout << "\nMin NPU matmul time: " << npu_time_min << "us.\n"; + std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) << "\n"; + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nError count: " << errors << "\n\nFailed.\n\n"; + return 1; + } +} diff --git a/programming_examples/basic/matrix_multiplication/whole_array/whole_array_iron.py b/programming_examples/basic/matrix_multiplication/whole_array/whole_array_iron.py index 63445145b91..d05fc9a41c0 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/whole_array_iron.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/whole_array_iron.py @@ -12,7 +12,6 @@ from aie.iron.controlflow import range_ from aie.helpers.taplib import TensorAccessSequence, TensorTiler2D - microkernel_mac_dim_map = { "npu": { "bf16": (4, 8, 4), diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile index 6489681d1e9..6fbaa483d73 100755 --- a/programming_examples/basic/passthrough_kernel/Makefile +++ b/programming_examples/basic/passthrough_kernel/Makefile @@ -90,8 +90,68 @@ trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.bin ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie2_trace_lineBased_8b_${data_size}.mlir.prj/input_with_addresses.mlir --output trace_${targetname}.json 
${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_${targetname}.json +# --- Dynamic TXN generation targets --- +# Generate parameterized C++ TXN from dynamic Python design. +# The generated function takes buffer_length as a runtime parameter. +# Uses aiecc to run the full pipeline (placement, objectFIFO lowering, etc.) +# and generate both XCLBIN and C++ TXN in one invocation. +build/dynsize/aie_dynamic.mlir: ${srcdir}/passthrough_kernel_dynamic.py + mkdir -p ${@D} + python3 $< -d ${devicename} -i1s ${in1_size} -os ${out_size} > $@ + +build/dynsize/final_dynamic.xclbin build/dynsize/generated_txn.h &: build/dynsize/aie_dynamic.mlir build/passThrough.cc.o + mkdir -p build/dynsize + cp build/passThrough.cc.o build/dynsize/ + cd build/dynsize && aiecc --aie-generate-xclbin --xclbin-name=final_dynamic.xclbin \ + --aie-generate-txn-cpp --txn-cpp-name=generated_txn.h \ + --no-xchesscc --no-xbridge \ + $(<:%=../../%) + +# Build the dynamic test executable (reuses test.cpp with USE_DYNAMIC_TXN) +${targetname}_dynamic_${data_size}.exe: ${srcdir}/test.cpp build/dynsize/generated_txn.h + rm -rf _build_dynamic + mkdir -p _build_dynamic + cd _build_dynamic && ${powershell} cmake `${getwslpath} ${srcdir}` \ + -DTARGET_NAME=${targetname}_dynamic \ + -DIN1_SIZE=${in1_size} -DOUT_SIZE=${out_size} \ + -DCMAKE_CXX_FLAGS="-DUSE_DYNAMIC_TXN -I${CURDIR}/build/dynsize -I$(shell cd ${srcdir}/../../.. && pwd)/include" + cd _build_dynamic && ${powershell} cmake --build . 
--config Release +ifeq "${powershell}" "powershell.exe" + cp _build_dynamic/${targetname}_dynamic.exe $@ +else + cp _build_dynamic/${targetname}_dynamic $@ +endif + +# Build dynamic-size executable using parameterized TXN (takes --dynamic-size at runtime) +max_size = 4096 +${targetname}_dynsize.exe: ${srcdir}/test.cpp build/dynsize/generated_txn.h + rm -rf _build_dynsize + mkdir -p _build_dynsize + cd _build_dynsize && ${powershell} cmake `${getwslpath} ${srcdir}` \ + -DTARGET_NAME=${targetname}_dynsize \ + -DIN1_SIZE=${max_size} -DOUT_SIZE=${max_size} \ + -DCMAKE_CXX_FLAGS="-DUSE_DYNAMIC_TXN -I${CURDIR}/build/dynsize -I$(shell cd ${srcdir}/../../.. && pwd)/include" + cd _build_dynsize && ${powershell} cmake --build . --config Release +ifeq "${powershell}" "powershell.exe" + cp _build_dynsize/${targetname}_dynsize.exe $@ +else + cp _build_dynsize/${targetname}_dynsize $@ +endif + +# Run with dynamically generated TXN and its own XCLBIN +run_dynamic: ${targetname}_dynamic_${data_size}.exe build/dynsize/final_dynamic.xclbin + ${powershell} ./$< -x build/dynsize/final_dynamic.xclbin -k MLIR_AIE + +# Build XCLBIN at max size, then run at multiple sizes without recompilation +run_dynamic_sizes: ${targetname}_dynsize.exe build/final_${data_size}.xclbin + @for sz in 1024 4096 16384 ${in1_size}; do \ + echo "=== Dynamic size: $$sz bytes ==="; \ + ${powershell} ./$< -x build/final_${data_size}.xclbin -k MLIR_AIE --dynamic-size $$sz; \ + echo ""; \ + done + clean_trace: rm -rf tmpTrace trace.txt parse*json trace*json clean: clean_trace - rm -rf build _build ${targetname}*.exe + rm -rf build _build _build_dynamic _build_dynsize ${targetname}*.exe diff --git a/programming_examples/basic/passthrough_kernel/passthrough_kernel.py b/programming_examples/basic/passthrough_kernel/passthrough_kernel.py index 9fef8261973..e12427cbe1b 100644 --- a/programming_examples/basic/passthrough_kernel/passthrough_kernel.py +++ 
b/programming_examples/basic/passthrough_kernel/passthrough_kernel.py @@ -11,9 +11,10 @@ from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker from aie.iron.device import NPU1Col1, NPU2 +from aie.extras import types as T -def my_passthrough_kernel(dev, in1_size, out_size, trace_size): +def my_passthrough_kernel(dev, in1_size, out_size, trace_size, dynamic_txn=False): in1_dtype = np.uint8 out_dtype = np.uint8 @@ -52,48 +53,86 @@ def core_fn(of_in, of_out, passThroughLine): # Runtime operations to move data to/from the AIE-array rt = Runtime() - with rt.sequence(vector_type, vector_type, vector_type) as (a_in, b_out, _): - rt.enable_trace(trace_size) - rt.start(my_worker) - rt.fill(of_in.prod(), a_in) - rt.drain(of_out.cons(), b_out, wait=True) + if dynamic_txn: + with rt.sequence(vector_type, vector_type, vector_type, T.i32) as ( + a_in, + b_out, + _, + buffer_length, + ): + rt.enable_trace(trace_size) + rt.start(my_worker) + rt.fill(of_in.prod(), a_in, sizes=[1, 1, 1, buffer_length]) + rt.drain( + of_out.cons(), + b_out, + sizes=[1, 1, 1, buffer_length], + wait=True, + ) + else: + with rt.sequence(vector_type, vector_type, vector_type) as (a_in, b_out, _): + rt.enable_trace(trace_size) + rt.start(my_worker) + rt.fill(of_in.prod(), a_in) + rt.drain(of_out.cons(), b_out, wait=True) # Place components (assign the resources on the device) and generate an MLIR module return Program(dev, rt).resolve_program() -p = argparse.ArgumentParser() -p.add_argument("-d", "--dev", required=True, dest="device", help="AIE Device") -p.add_argument( - "-i1s", "--in1_size", required=True, dest="in1_size", help="Input 1 size" -) -p.add_argument("-os", "--out_size", required=True, dest="out_size", help="Output size") -p.add_argument( - "-t", - "--trace_size", - required=False, - dest="trace_size", - default=0, - help="Trace buffer size", -) -opts = p.parse_args(sys.argv[1:]) - -if opts.device == "npu": - dev = NPU1Col1() -elif opts.device == "npu2": - dev = NPU2() -else: - 
raise ValueError("[ERROR] Device name {} is unknown".format(opts.device)) - -in1_size = int(opts.in1_size) -if in1_size % 64 != 0 or in1_size < 512: +def main(argv=None): + p = argparse.ArgumentParser() + p.add_argument("-d", "--dev", required=True, dest="device", help="AIE Device") + p.add_argument( + "-i1s", "--in1_size", required=True, dest="in1_size", help="Input 1 size" + ) + p.add_argument( + "-os", "--out_size", required=True, dest="out_size", help="Output size" + ) + p.add_argument( + "-t", + "--trace_size", + required=False, + dest="trace_size", + default=0, + help="Trace buffer size", + ) + p.add_argument( + "--dynamic-txn", + action="store_true", + default=False, + help="Emit a runtime-parameterized runtime_sequence for TXN C++ generation", + ) + opts = p.parse_args(sys.argv[1:] if argv is None else argv) + + if opts.device == "npu": + dev = NPU1Col1() + elif opts.device == "npu2": + dev = NPU2() + else: + raise ValueError("[ERROR] Device name {} is unknown".format(opts.device)) + + in1_size = int(opts.in1_size) + if in1_size % 64 != 0 or in1_size < 512: + print( + "In1 buffer size (" + + str(in1_size) + + ") must be a multiple of 64 and greater than or equal to 512" + ) + raise ValueError + out_size = int(opts.out_size) + trace_size = int(opts.trace_size) + print( - "In1 buffer size (" - + str(in1_size) - + ") must be a multiple of 64 and greater than or equal to 512" + my_passthrough_kernel( + dev, + in1_size, + out_size, + trace_size, + dynamic_txn=opts.dynamic_txn, + ) ) - raise ValueError -out_size = int(opts.out_size) -trace_size = int(opts.trace_size) -print(my_passthrough_kernel(dev, in1_size, out_size, trace_size)) + +if __name__ == "__main__": + main() diff --git a/programming_examples/basic/passthrough_kernel/passthrough_kernel_dynamic.py b/programming_examples/basic/passthrough_kernel/passthrough_kernel_dynamic.py new file mode 100644 index 00000000000..11dc4f3eac5 --- /dev/null +++ 
b/programming_examples/basic/passthrough_kernel/passthrough_kernel_dynamic.py @@ -0,0 +1,33 @@ +# passthrough_kernel/passthrough_kernel_dynamic.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates +# +"""Dynamic passthrough TXN generation using the same IRON API as the static example.""" + +import argparse + +from aie.iron.device import NPU1, NPU2 + +from passthrough_kernel import my_passthrough_kernel + +if __name__ == "__main__": + p = argparse.ArgumentParser() + p.add_argument("-d", "--device", choices=["npu", "npu2"], default="npu2") + p.add_argument("-i1s", "--in1_size", type=int, default=4096) + p.add_argument("-os", "--out_size", type=int, default=4096) + args = p.parse_args() + + dev = NPU2() if args.device == "npu2" else NPU1() + print( + my_passthrough_kernel( + dev, + args.in1_size, + args.out_size, + 0, + dynamic_txn=True, + ) + ) diff --git a/programming_examples/basic/passthrough_kernel/run_makefile_dynamic.lit b/programming_examples/basic/passthrough_kernel/run_makefile_dynamic.lit new file mode 100644 index 00000000000..e736ac37396 --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/run_makefile_dynamic.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai_npu1, peano +// +// RUN: mkdir -p test_dynamic +// RUN: cd test_dynamic +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile devicename=npu +// RUN: %run_on_npu1% make -f %S/Makefile run_dynamic devicename=npu diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile_dynamic.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile_dynamic.lit new file mode 100644 index 00000000000..f3849577292 --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile_dynamic.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai_npu2, peano +// +// RUN: mkdir -p test_dynamic_stx +// RUN: cd test_dynamic_stx +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile devicename=npu2 +// RUN: %run_on_npu2% make -f %S/Makefile run_dynamic devicename=npu2 diff --git a/programming_examples/basic/passthrough_kernel/test.cpp b/programming_examples/basic/passthrough_kernel/test.cpp index 4841543b34f..74a7ad7ed45 100644 --- a/programming_examples/basic/passthrough_kernel/test.cpp +++ b/programming_examples/basic/passthrough_kernel/test.cpp @@ -11,6 +11,10 @@ #include "xrt_test_wrapper.h" #include +#ifdef USE_DYNAMIC_TXN +#include "generated_txn.h" +#endif + //***************************************************************************** // Modify this section to customize buffer datatypes, initialization functions, // and verify function. 
The other place to reconfigure your design is the @@ -63,13 +67,31 @@ int verify_passthrough_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut, int main(int argc, const char *argv[]) { - constexpr int IN1_VOLUME = IN1_SIZE / sizeof(DATATYPE_IN1); - constexpr int OUT_VOLUME = OUT_SIZE / sizeof(DATATYPE_OUT); + int in1_volume = IN1_SIZE / sizeof(DATATYPE_IN1); + int out_volume = OUT_SIZE / sizeof(DATATYPE_OUT); args myargs = parse_args(argc, argv); +#ifdef USE_DYNAMIC_TXN + // If --dynamic-size is given, override the compiled-in buffer sizes + // and generate TXN instructions for that size at runtime. + if (myargs.dynamic_size > 0) { + uint32_t size_bytes = myargs.dynamic_size; + in1_volume = size_bytes / sizeof(DATATYPE_IN1); + out_volume = size_bytes / sizeof(DATATYPE_OUT); + myargs.generate_instr = [size_bytes]() { + return generate_txn_sequence(size_bytes); + }; + } else { + myargs.generate_instr = []() { + // Default: use compile-time size + return generate_txn_sequence(IN1_SIZE); + }; + } +#endif + int res = setup_and_run_aie( - IN1_VOLUME, OUT_VOLUME, myargs); + in1_volume, out_volume, myargs); return res; } diff --git a/programming_examples/basic/passthrough_pykernel/passthrough_pykernel_placed.py b/programming_examples/basic/passthrough_pykernel/passthrough_pykernel_placed.py index 4f41af9d4d2..ccd132bd720 100644 --- a/programming_examples/basic/passthrough_pykernel/passthrough_pykernel_placed.py +++ b/programming_examples/basic/passthrough_pykernel/passthrough_pykernel_placed.py @@ -14,7 +14,6 @@ from aie.helpers.dialects.func import func from aie.iron.controlflow import range_ - dev = AIEDevice.npu1_1col if len(sys.argv) > 2: diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index f95babc7273..e371f7cadd3 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -542,4 +542,4 @@ add_custom_command( ${CMAKE_BINARY_DIR}/bin ) # during install -install(PROGRAMS compiler/aiecc.py compiler/txn2mlir.py DESTINATION bin) \ No newline at end of 
file +install(PROGRAMS compiler/aiecc.py compiler/txn2mlir.py DESTINATION bin) diff --git a/python/dialects/aie.py b/python/dialects/aie.py index 783d3d91f93..d4a2be32532 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -10,6 +10,13 @@ from ._aie_enum_gen import * from ._aie_ops_gen import * + + +# TraceShimRouting enum (from AIETraceAttrs.td, not yet auto-generated) +class TraceShimRouting(IntEnum): + Single = 0 + + from ._aie_ops_gen import _Dialect from ._ods_common import _cext from .func import FuncOp @@ -89,7 +96,12 @@ def __init__(self, buffer, index, value, loc=None, ip=None): buff_name = buffer if isinstance(buffer, BufferOp): buff_name = buffer.sym_name.value - super().__init__(buffer=buff_name, index=index, value=value, loc=loc, ip=ip) + if isinstance(value, Value): + super().__init__( + buffer=buff_name, index=index, dyn_value=value, loc=loc, ip=ip + ) + else: + super().__init__(buffer=buff_name, index=index, value=value, loc=loc, ip=ip) class external_func(FuncOp): diff --git a/python/dialects/aiex.py b/python/dialects/aiex.py index 409a135273d..1e2240ae624 100644 --- a/python/dialects/aiex.py +++ b/python/dialects/aiex.py @@ -23,13 +23,17 @@ from ..helpers.util import v8bfp16ebs8, v16bfp16ebs16 from ..ir import ( DictAttr, + IndexType, IntegerAttr, + IntegerType, UnitAttr, Type, + Value, InsertionPoint, Attribute, AttrBuilder, ) +from . import arith as _arith # noinspection PyUnresolvedReferences from ..extras import types as T @@ -54,6 +58,39 @@ def dma_wait(*args: ObjectFifoCreateOp | str): npu_dma_wait(str_name) +def _cast_to_i64(v): + """Coerce an SSA Value to i64 for NPU DMA descriptor operands. + + The npu.dma_memcpy_nd op currently declares offsets/sizes/strides as + Variadic. Front-end arithmetic, however, may naturally produce + `index` (from scf.for induction vars), `i32` (from runtime_sequence + args), or any other signless-integer width. 
This helper inserts the + canonical conversion at the call site so callers can pass in whatever + SSA value falls out of their computation without manual casts. + + TODO(future PR): switch this to _cast_to_i32 once AIEX.td tightens the + operand type to Variadic (matching the NPU descriptor register + width). AIEDmaToNpu's getAsValue already does width coercion in either + direction, so the lowering does not care which width the IR carries. + """ + if not isinstance(v, Value): + return v + i64 = IntegerType.get_signless(64) + vt = v.type + if vt == i64: + return v + if isinstance(vt, IndexType): + return _arith.index_cast(i64, v) + if isinstance(vt, IntegerType): + if vt.width > 64: + return _arith.trunci(i64, v) + return _arith.extui(i64, v) + raise TypeError( + f"npu_dma_memcpy_nd offsets/sizes/strides must be index or signless " + f"integer, got {vt}" + ) + + class NpuDmaMemcpyNd(NpuDmaMemcpyNdOp): """ Enables data transfers between the AIE Engine array and external memory. @@ -122,6 +159,14 @@ def __init__( dynamic_strides, _packed_strides, static_strides = _dispatch_mixed_values( strides ) + # The op operands $offsets/$sizes/$strides are Variadic (matching + # the NPU descriptor register width). Whatever the user's SSA + # arithmetic produced (index from scf.for, i64 from a wider compute, + # iN from a custom path), normalise it to i64 here so callers do not + # have to insert arith.index_cast / trunci / extui themselves. + dynamic_offsets = [_cast_to_i64(v) for v in dynamic_offsets] + dynamic_sizes = [_cast_to_i64(v) for v in dynamic_sizes] + dynamic_strides = [_cast_to_i64(v) for v in dynamic_strides] if isinstance(metadata, ObjectFifoCreateOp): metadata = metadata.sym_name.value super().__init__( @@ -143,6 +188,79 @@ def __init__( npu_dma_memcpy_nd = NpuDmaMemcpyNd + + +# Dynamic convenience wrappers +# These create unified ops with SSA value operands for runtime-parameterized 
The static attributes serve as placeholders (0) and the dynamic +# SSA values override them at runtime. + + +def npu_write32_dynamic(dyn_address, dyn_value, *, buffer=None, column=None, row=None): + """write32 with SSA value operands for runtime-parameterized sequences.""" + return NpuWrite32Op( + address=0, + value=0, + buffer=buffer, + column=column, + row=row, + dyn_address=dyn_address, + dyn_value=dyn_value, + ) + + +def npu_maskwrite32_dynamic( + dyn_address, dyn_value, dyn_mask, *, buffer=None, column=None, row=None +): + """maskwrite32 with SSA value operands for runtime-parameterized sequences.""" + return NpuMaskWrite32Op( + address=0, + value=0, + mask=0, + buffer=buffer, + column=column, + row=row, + dyn_address=dyn_address, + dyn_value=dyn_value, + dyn_mask=dyn_mask, + ) + + +def npu_sync_dynamic( + dyn_column, dyn_row, dyn_direction, dyn_channel, dyn_column_num, dyn_row_num +): + """sync with SSA value operands for runtime-parameterized sequences.""" + return NpuSyncOp( + column=0, + row=0, + direction=0, + channel=0, + column_num=0, + row_num=0, + dyn_column=dyn_column, + dyn_row=dyn_row, + dyn_direction=dyn_direction, + dyn_channel=dyn_channel, + dyn_column_num=dyn_column_num, + dyn_row_num=dyn_row_num, + ) + + +# Override auto-generated npu_rtp_write to support SSA values + + +def npu_rtp_write(buffer, index, value, *, loc=None, ip=None): + """RTP write supporting both static int and dynamic SSA Value. + + When value is a Python int, it is passed as a static I32Attr. + When value is an SSA Value (i32), it is passed as the dyn_value operand. 
+ """ + if isinstance(value, Value): + return NpuWriteRTPOp( + buffer=buffer, index=index, dyn_value=value, loc=loc, ip=ip + ) + else: + return NpuWriteRTPOp(buffer=buffer, index=index, value=value, loc=loc, ip=ip) + + # Runtime sequence diff --git a/python/iron/program.py b/python/iron/program.py index bc0075b6614..4502d15887e 100644 --- a/python/iron/program.py +++ b/python/iron/program.py @@ -132,4 +132,4 @@ def device_body(): def _print_verify(self, ctx): verify = ctx.module.operation.verify() if verify != True: - logger.error(str(verify)) + raise RuntimeError(f"MLIR verification failed: {verify}") diff --git a/python/iron/runtime/data.py b/python/iron/runtime/data.py index f64cf31775c..05dbe2dfc7f 100644 --- a/python/iron/runtime/data.py +++ b/python/iron/runtime/data.py @@ -15,6 +15,7 @@ np_ndarray_type_get_shape, ) from ...helpers.taplib import TensorAccessPattern, TensorTiler2D +from ...ir import Type as MlirType class RuntimeData: @@ -60,3 +61,37 @@ def op(self, op: MemRefValue): if self._op: raise ValueError("Cannot set operation for RuntimeData more than once.") self._op = op + + +class RuntimeScalar: + """A handle to a scalar runtime parameter (e.g. T.i32()) in the Runtime sequence.""" + + def __init__(self, mlir_type: MlirType): + """Construct a handle to a scalar Runtime parameter. + + Args: + mlir_type (MlirType): The MLIR type of the scalar, or a zero-arg + callable that produces one within an active MLIR context. 
+ """ + self._mlir_type = mlir_type + self._op = None + + @property + def mlir_type(self) -> MlirType: + """The MLIR type of this scalar.""" + if callable(self._mlir_type): + return self._mlir_type() + return self._mlir_type + + @property + def op(self): + """The MLIR SSA value for this scalar, set during resolve.""" + if self._op is None: + raise ValueError("Cannot get operation for RuntimeScalar before it is set.") + return self._op + + @op.setter + def op(self, op): + if self._op is not None: + raise ValueError("Cannot set operation for RuntimeScalar more than once.") + self._op = op diff --git a/python/iron/runtime/dmatask.py b/python/iron/runtime/dmatask.py index 958f6a04df2..47b067a1f8b 100644 --- a/python/iron/runtime/dmatask.py +++ b/python/iron/runtime/dmatask.py @@ -8,11 +8,13 @@ """DMATask: a RuntimeTask that generates a shim DMA transfer operation.""" from ... import ir # type: ignore +from ...ir import Value # type: ignore -from ...dialects._aiex_ops_gen import dma_start_task # type: ignore +from ...dialects.arith import ExtUIOp +from ...dialects.aiex import dma_free_task, dma_start_task, dma_wait, npu_dma_memcpy_nd from ...dialects.aiex import shim_dma_single_bd_task from ..dataflow import ObjectFifoHandle -from .data import RuntimeData +from .data import RuntimeData, RuntimeScalar from ...helpers.taplib import TensorAccessPattern from .task import RuntimeTask from .taskgroup import RuntimeTaskGroup @@ -23,9 +25,12 @@ def __init__( self, object_fifo: ObjectFifoHandle, rt_data: RuntimeData, - tap: TensorAccessPattern, + tap: TensorAccessPattern | None = None, task_group: RuntimeTaskGroup | None = None, wait: bool = False, + offset=None, + sizes=None, + strides=None, ): """A RuntimeTask that will resolve to a DMA Operation. @@ -35,12 +40,24 @@ def __init__( tap (TensorAccessPattern): The access pattern associated with the operation. task_group (RuntimeTaskGroup | None, optional): The task group associated with the operation. Defaults to None. 
wait (bool, optional): Whether this task should conclude with a call to await or a call to free. Defaults to False. + offset (int | None, optional): Byte offset into the runtime buffer for the start of the transfer. Mutually exclusive with ``tap``. Defaults to None. + sizes (list[int] | None, optional): Multi-dimensional transfer sizes (up to 4D) describing the shape of each DMA tile. Mutually exclusive with ``tap``. Defaults to None. + strides (list[int] | None, optional): Multi-dimensional strides (in element granularity) corresponding to ``sizes``. Mutually exclusive with ``tap``. Defaults to None. """ self._object_fifo = object_fifo self._rt_data = rt_data self._tap = tap self._wait = wait self._task = None + self._offset = offset + self._sizes = sizes + self._strides = strides + self._bd_id = None + if tap and not (offset is None and sizes is None and strides is None): + raise ValueError( + "DMATask can take either a TensorAccessPattern OR " + "(offset and/or sizes and/or strides), but not both." 
+ ) RuntimeTask.__init__(self, task_group) def will_wait(self) -> bool: @@ -60,15 +77,116 @@ def task(self): raise ValueError("Cannot get task before it is created (during resolve())") return self._task + @property + def bd_id(self) -> int: + if self._bd_id is None: + raise ValueError("Cannot get bd_id before it is assigned.") + return self._bd_id + + @property + def bd_allocation_key(self): + tile = self._object_fifo.endpoint.tile + if tile.col is not None and tile.row is not None: + return (tile.col, tile.row, tile.tile_type) + return ("unplaced", tile.tile_type) + + def uses_direct_npu_dma(self) -> bool: + return any( + self._contains_runtime_values(v) + for v in (self._offset, self._sizes, self._strides) + ) + + def emit_wait(self) -> None: + if self.uses_direct_npu_dma(): + dma_wait(self._object_fifo.op) + return + from ...dialects.aiex import dma_await_task + + dma_await_task(self._task) + + def emit_free(self) -> None: + if self.uses_direct_npu_dma(): + # Direct NPU DMA tasks do not use dma_free_task; synchronization + # is handled by dma_wait in emit_wait(). 
+ return + dma_free_task(self._task) + + @staticmethod + def _contains_runtime_values(value) -> bool: + if isinstance(value, (RuntimeScalar, Value)): + return True + if isinstance(value, (list, tuple)): + return any(DMATask._contains_runtime_values(v) for v in value) + return False + + @staticmethod + def _resolve_runtime_values(value): + if isinstance(value, RuntimeScalar): + return DMATask._to_i64_value(value.op) + if isinstance(value, Value): + return DMATask._to_i64_value(value) + if isinstance(value, list): + return [DMATask._resolve_runtime_values(v) for v in value] + if isinstance(value, tuple): + return tuple(DMATask._resolve_runtime_values(v) for v in value) + return value + + @staticmethod + def _to_i64_value(value: Value) -> Value: + i64_ty = ir.IntegerType.get_signless(64) + if value.type == i64_ty: + return value + return ExtUIOp(i64_ty, value).result + def resolve( self, loc: ir.Location | None = None, ip: ir.InsertionPoint | None = None, + bd_id: int | None = None, ) -> None: + if self.uses_direct_npu_dma(): + if bd_id is None: + raise ValueError("Direct NPU DMA lowering requires an assigned bd_id.") + self._bd_id = bd_id + + if self._tap is not None: + self._task = npu_dma_memcpy_nd( + self._object_fifo.op, + bd_id, + self._rt_data.op, + tap=self._tap, + issue_token=self._wait, + ) + return + + sizes = self._resolve_runtime_values(self._sizes) + if sizes is None: + raise ValueError( + "Direct NPU DMA lowering requires explicit sizes or a tap." 
+ ) + strides = self._resolve_runtime_values(self._strides) + offset = self._resolve_runtime_values(self._offset) + if offset is None: + offset = 0 + offsets = [0, 0, 0, offset] + self._task = npu_dma_memcpy_nd( + self._object_fifo.op, + bd_id, + self._rt_data.op, + offsets=offsets, + sizes=sizes, + strides=strides, + issue_token=self._wait, + ) + return + self._task = shim_dma_single_bd_task( self._object_fifo.op, self._rt_data.op, tap=self._tap, + offset=self._offset, + sizes=self._sizes, + strides=self._strides, issue_token=self._wait, ) dma_start_task(self._task) diff --git a/python/iron/runtime/runtime.py b/python/iron/runtime/runtime.py index a204f9e478c..5e7172754b1 100644 --- a/python/iron/runtime/runtime.py +++ b/python/iron/runtime/runtime.py @@ -8,7 +8,8 @@ """Runtime: orchestrates host-side data movement and worker execution for an IRON program.""" from __future__ import annotations -from collections import defaultdict +import bisect +from collections import defaultdict, deque from contextlib import contextmanager import logging import numpy as np @@ -22,14 +23,13 @@ from ...dialects.aie import tile from ...dialects.aiex import runtime_sequence -from ...dialects._aiex_ops_gen import dma_await_task, dma_free_task # type: ignore from ...helpers.taplib import TensorAccessPattern from ..dataflow import ObjectFifoHandle from ..device import Tile, AnyShimTile from ..resolvable import Resolvable from ..worker import Worker, WorkerRuntimeBarrier, _BarrierSetOp from .dmatask import DMATask -from .data import RuntimeData +from .data import RuntimeData, RuntimeScalar from .endpoint import RuntimeEndpoint from .taskgroup import RuntimeTaskGroup from .task import ( @@ -37,17 +37,41 @@ RuntimeStartTask, InlineOpRuntimeTask, FinishTaskGroupTask, + RtpWriteTask, ) +class _RuntimeBdIdAllocator: + """Simple runtime-sequence BD allocator for direct npu.dma_memcpy_nd emission.""" + + def __init__(self, max_bd_ids_per_key: int = 16): + self._max_bd_ids_per_key = 
max_bd_ids_per_key + self._next_id = defaultdict(int) + self._free_ids = defaultdict(deque) + + def allocate(self, key) -> int: + if self._free_ids[key]: + return self._free_ids[key].popleft() + bd_id = self._next_id[key] + if bd_id >= self._max_bd_ids_per_key: + raise ValueError( + f"Runtime BD allocator exhausted available IDs for key {key}." + ) + self._next_id[key] += 1 + return bd_id + + def free(self, key, bd_id: int) -> None: + free_ids = self._free_ids[key] + if bd_id in free_ids: + return + bisect.insort(free_ids, bd_id) + + class Runtime(Resolvable): """A Runtime contains that operations and structure of all operations that need to be taken care of by the host/runtime in order to run a program. """ - # Used to generate unique task group IDs within this Runtime. - __task_group_index = 0 - def __init__( self, strict_task_groups: bool = True, @@ -68,25 +92,44 @@ def __init__( self._trace_workers = None self._strict_task_groups = strict_task_groups self._ddr_id = 4 + self.__task_group_index = 0 @contextmanager - def sequence(self, *input_types: type[np.ndarray]): + def sequence(self, *input_types): """A RuntimeSequence is a sequence of operations that are performed in support of a program. Common operations include input and output data movement. + Args: + *input_types: numpy ndarray types for buffer parameters, or MLIR types + (e.g. IntegerType) for scalar parameters. + Raises: ValueError: Arguments are validated. ValueError: If task groups are not finished within the sequence() context, and error will be raised. Yields: - RuntimeData | tuple[RuntimeData, ...]: Handles to the runtime buffers matching the declared input types. + RuntimeData | RuntimeScalar | tuple: Handles to the buffers/scalars matching the input types. 
""" try: - self._rt_data = list(map(RuntimeData, input_types)) - if len(self._rt_data) == 1: - yield self._rt_data[0] + items = [] + for t in input_types: + if isinstance(t, ir.Type): + items.append(RuntimeScalar(t)) + elif getattr(t, "__origin__", None) is np.ndarray: + items.append(RuntimeData(t)) + elif callable(t): + items.append(RuntimeScalar(t)) + else: + raise TypeError( + f"Unsupported sequence argument type: {type(t).__name__}. " + f"Expected np.ndarray type, ir.Type, or callable." + ) + self._sequence_items = items + self._rt_data = [i for i in items if isinstance(i, RuntimeData)] + if len(items) == 1: + yield items[0] else: - yield tuple(self._rt_data.copy()) + yield tuple(items) finally: if len(self._open_task_groups) != 0: tgs_str = ", ".join([str(t) for t in self._open_task_groups]) @@ -143,6 +186,9 @@ def fill( in_fifo: ObjectFifoHandle, source: RuntimeData, tap: TensorAccessPattern | None = None, + offset=None, + sizes=None, + strides=None, task_group: RuntimeTaskGroup | None = None, wait: bool = False, tile: Tile = AnyShimTile, @@ -155,6 +201,11 @@ def fill( source (RuntimeData): The input Runtime data buffer. tap (TensorAccessPattern | None, optional): A way of specifying how data in the buffer is accessed when sending it to the in_fifo. If None is given, this will default to a linear transfer containing all data in the source buffer. Defaults to None. + offset (optional): Starting element offset for the transfer. + sizes (optional): Four-dimensional transfer sizes. RuntimeScalar + values are allowed and trigger direct npu.dma_memcpy_nd lowering. + strides (optional): Four-dimensional transfer strides. RuntimeScalar + values are allowed and trigger direct npu.dma_memcpy_nd lowering. task_group (RuntimeTaskGroup | None, optional): A TaskGroup to associate this task with. Defaults to None. wait (bool, optional): Whether this Task should be awaited on or not. If not, it will be freed when the task group is finished. Defaults to False. 
tile (Tile | None, optional): The Shim tile to associate the data transfer with. Defaults to AnyShimTile. @@ -168,18 +219,32 @@ def fill( ) rt_endpoint = RuntimeEndpoint(tile) - if tap is None: + if tap is None and offset is None and sizes is None and strides is None: tap = source.default_tap() in_fifo.endpoint = rt_endpoint self._fifos.add(in_fifo) - self._tasks.append(DMATask(in_fifo, source, tap, task_group, wait)) + self._tasks.append( + DMATask( + in_fifo, + source, + tap, + task_group, + wait, + offset=offset, + sizes=sizes, + strides=strides, + ) + ) def drain( self, out_fifo: ObjectFifoHandle, dest: RuntimeData, tap: TensorAccessPattern | None = None, + offset=None, + sizes=None, + strides=None, task_group: RuntimeTaskGroup | None = None, wait: bool = False, tile: Tile = AnyShimTile, @@ -192,6 +257,11 @@ def drain( dest (RuntimeData): The output Runtime data buffer. tap (TensorAccessPattern | None, optional): A way of specifying how data in the buffer is accessed when reading from the out_fifo. If None is given, this will default to a linear transfer containing all data in the destination buffer. Defaults to None. + offset (optional): Starting element offset for the transfer. + sizes (optional): Four-dimensional transfer sizes. RuntimeScalar + values are allowed and trigger direct npu.dma_memcpy_nd lowering. + strides (optional): Four-dimensional transfer strides. RuntimeScalar + values are allowed and trigger direct npu.dma_memcpy_nd lowering. task_group (RuntimeTaskGroup | None, optional): A TaskGroup to associate this task with. Defaults to None. wait (bool, optional): Whether this Task should be awaited on or not. If not, it will be freed when the task group is finished. Defaults to False. tile (Tile | None, optional): The Shim tile to associate the data transfer with. Defaults to AnyShimTile. 
@@ -205,12 +275,23 @@ def drain( ) rt_endpoint = RuntimeEndpoint(tile) - if tap is None: + if tap is None and offset is None and sizes is None and strides is None: tap = dest.default_tap() out_fifo.endpoint = rt_endpoint self._fifos.add(out_fifo) - self._tasks.append(DMATask(out_fifo, dest, tap, task_group, wait)) + self._tasks.append( + DMATask( + out_fifo, + dest, + tap, + task_group, + wait, + offset=offset, + sizes=sizes, + strides=strides, + ) + ) def start(self, *args: Worker): """A placeholder operation to indicate that one or more Worker should be started on the device. @@ -311,14 +392,37 @@ def get_first_cons_shimtile(self): if endpoint_tile.row == 0: return endpoint_tile.op + def write_rtp(self, buf, index: int, value): + """Queue an RTP write to a buffer. + + Args: + buf: A Buffer object (with _name attribute) or a string buffer name. + index (int): The index within the RTP buffer to write. + value: The value to write (integer constant or RuntimeScalar). + """ + buf_name = buf._name if hasattr(buf, "_name") else str(buf) + self._tasks.append(RtpWriteTask(buf_name, index, value)) + def resolve( self, loc: ir.Location | None = None, ip: ir.InsertionPoint | None = None, ) -> None: - rt_dtypes = [rt_data.arr_type for rt_data in self._rt_data] + sequence_items = getattr(self, "_sequence_items", None) + if sequence_items is None: + sequence_items = self._rt_data + + rt_dtypes = [] + for item in sequence_items: + if isinstance(item, RuntimeData): + rt_dtypes.append(item.arr_type) + elif isinstance(item, RuntimeScalar): + rt_dtypes.append(item.mlir_type) + else: + rt_dtypes.append(item) task_group_actions = defaultdict(list) + bd_id_allocator = _RuntimeBdIdAllocator() @runtime_sequence(*rt_dtypes) def sequence(*args): @@ -330,62 +434,62 @@ def sequence(*args): routing="single", ) - for rt_data, rt_data_val in zip(self._rt_data, args): - rt_data.op = rt_data_val + for item, arg_val in zip(sequence_items, args): + item.op = arg_val def finish_task_group(tg, 
task_group_actions): actions = task_group_actions[tg] - # We want to keep order, EXCEPT do waits before frees - wait_tasks = [ - (fn, args) for (fn, args) in actions if fn == dma_await_task - ] - free_tasks = [ - (fn, args) for (fn, args) in actions if fn == dma_free_task - ] + wait_tasks = [task for (kind, task) in actions if kind == "wait"] + free_tasks = [task for (kind, task) in actions if kind == "free"] - # Check for anything known -- this shouldn't happen, but we'll catch it gracefully anyways. if len(wait_tasks) + len(free_tasks) != len(actions): unknown_actions = [ - (fn, args) - for (fn, args) in actions - if fn != dma_await_task and fn != dma_free_task + (kind, task) + for (kind, task) in actions + if kind not in ("wait", "free") ] raise Exception( f"Unknown action type detected: {','.join(unknown_actions)}" ) - for fn, args in wait_tasks + free_tasks: - fn(*args) + for task in wait_tasks: + task.emit_wait() + if task.uses_direct_npu_dma(): + bd_id_allocator.free(task.bd_allocation_key, task.bd_id) + for task in free_tasks: + task.emit_free() + if task.uses_direct_npu_dma(): + bd_id_allocator.free(task.bd_allocation_key, task.bd_id) task_group_actions[tg] = None default_task_group = self.task_group() default_tasks = False task_group_tasks = False for task in self._tasks: - - task.resolve() if isinstance(task, DMATask): + if task.uses_direct_npu_dma(): + task.resolve( + bd_id=bd_id_allocator.allocate(task.bd_allocation_key) + ) + else: + task.resolve() if task.task_group: task_group_tasks = True current_task_group = task.task_group else: default_tasks = True current_task_group = default_task_group - if task.will_wait(): - task_group_actions[current_task_group].append( - (dma_await_task, [task.task]) - ) - else: - task_group_actions[current_task_group].append( - (dma_free_task, [task.task]) - ) + action_kind = "wait" if task.will_wait() else "free" + task_group_actions[current_task_group].append((action_kind, task)) + else: + task.resolve() if 
isinstance(task, FinishTaskGroupTask): finish_task_group(task.task_group, task_group_actions) if self._strict_task_groups and default_tasks and task_group_tasks: raise Exception( - f"Mixing explicit task groups and the default task group is prohibitted. " + f"Mixing explicit task groups and the default task group is prohibited. " f"Please assign all default tasks ({task_group_actions[default_task_group]}) to a task group." ) diff --git a/python/iron/runtime/task.py b/python/iron/runtime/task.py index d3a1c57fec4..57988878d50 100644 --- a/python/iron/runtime/task.py +++ b/python/iron/runtime/task.py @@ -8,10 +8,12 @@ from typing import Callable from ... import ir # type: ignore +from ...dialects.aiex import npu_rtp_write from ..buffer import Buffer from ..resolvable import Resolvable from ..worker import Worker +from .data import RuntimeScalar from .taskgroup import RuntimeTaskGroup @@ -95,3 +97,28 @@ def resolve( for arg in self._args: InlineOpRuntimeTask._resolve_buffers(arg, loc, ip) self._fn(*self._args) + + +class RtpWriteTask(RuntimeTask): + """A RuntimeTask that writes a value to a Runtime Parameter (RTP) buffer.""" + + def __init__(self, buf_name: str, index: int, value): + """Construct an RtpWriteTask. + + Args: + buf_name (str): The name of the RTP buffer. + index (int): The index within the RTP buffer to write. + value: The value to write (int or RuntimeScalar op). 
+ """ + self._buf_name = buf_name + self._index = index + self._value = value + RuntimeTask.__init__(self, None) + + def resolve( + self, + loc: ir.Location | None = None, + ip: ir.InsertionPoint | None = None, + ) -> None: + val = self._value.op if isinstance(self._value, RuntimeScalar) else self._value + npu_rtp_write(self._buf_name, self._index, val) diff --git a/python/iron/worker.py b/python/iron/worker.py index c946b9eb81c..dcb892dccd8 100644 --- a/python/iron/worker.py +++ b/python/iron/worker.py @@ -40,6 +40,7 @@ def __init__( allocation_scheme: str = None, trace: int = None, trace_events: list = None, + dynamic_objfifo_lowering: bool = None, ): """Construct a Worker @@ -53,6 +54,7 @@ def __init__( Will override any allocation scheme set on the tile. trace (int, optional): If >0, enable tracing for this worker. trace_events (list | None, optional): Custom list of trace events for this worker. Defaults to None. + dynamic_objfifo_lowering (bool, optional): If True, enables dynamic ObjectFifo lowering for runtime-parameterized loop bounds. Defaults to None. Raises: ValueError: Parameters are validated. 
@@ -66,6 +68,7 @@ def __init__( self._tile = tile self._while_true = while_true self.stack_size = stack_size + self.dynamic_objfifo_lowering = dynamic_objfifo_lowering self.allocation_scheme = allocation_scheme if allocation_scheme: self._tile.allocation_scheme = allocation_scheme @@ -148,7 +151,11 @@ def resolve( l = lock(my_tile) barrier._add_worker_lock(l) - @core(my_tile, stack_size=self.stack_size) + @core( + my_tile, + stack_size=self.stack_size, + dynamic_objfifo_lowering=self.dynamic_objfifo_lowering, + ) def core_body(): for _ in range_(sys.maxsize) if self._while_true else range(1): self.core_fn(*self.fn_args) diff --git a/python/utils/test.py b/python/utils/test.py index d91b83d1894..1e7d34705c6 100644 --- a/python/utils/test.py +++ b/python/utils/test.py @@ -15,6 +15,7 @@ * Calls create_default_argparser and returns the parsed results * Useful if you don't need additional custom args """ + import argparse from aie.utils import TraceConfig, NPUKernel diff --git a/python/utils/trace/events/__init__.py b/python/utils/trace/events/__init__.py index dd7a4c8d7a7..b2c7d327e0e 100644 --- a/python/utils/trace/events/__init__.py +++ b/python/utils/trace/events/__init__.py @@ -11,6 +11,7 @@ - aie2: AIE2/AIEML architecture events - aie2p: AIE2P architecture events """ + from enum import IntEnum import typing diff --git a/runtime_lib/CMakeLists.txt b/runtime_lib/CMakeLists.txt index 78f6c963b8b..29bd21ad3b6 100644 --- a/runtime_lib/CMakeLists.txt +++ b/runtime_lib/CMakeLists.txt @@ -133,6 +133,20 @@ foreach(target ${AIE_RUNTIME_TARGETS}) TEST_BEFORE_INSTALL true TEST_EXCLUDE_FROM_MAIN true ) + else() + # Even when the full test_lib ExternalProject is disabled (e.g. wheels + # build without VITIS or LibXAIE), the header-only payload that the + # programming examples rely on (cxxopts.hpp, test_utils.h, + # xrt_test_wrapper.h) must still be installed so host-side test + # executables can compile against the install tree. 
+ if (NOT WIN32 OR ${target} STREQUAL "x86_64") + install(FILES + test_lib/cxxopts.hpp + test_lib/test_utils.h + test_lib/xrt_test_wrapper.h + DESTINATION ${CMAKE_INSTALL_PREFIX}/runtime_lib/${target}/test_lib/include + ) + endif() endif() endforeach() diff --git a/runtime_lib/test_lib/xrt_test_wrapper.h b/runtime_lib/test_lib/xrt_test_wrapper.h index b49d77b9395..044bff3a4e5 100644 --- a/runtime_lib/test_lib/xrt_test_wrapper.h +++ b/runtime_lib/test_lib/xrt_test_wrapper.h @@ -7,6 +7,7 @@ #include "xrt/xrt_kernel.h" #include +#include #include #include @@ -20,6 +21,8 @@ struct args { std::string xclbin; std::string kernel; std::string trace_file; + std::function()> generate_instr; // optional + int dynamic_size = 0; // runtime transfer size in bytes (0 = use compiled-in) }; struct args parse_args(int argc, const char *argv[]) { @@ -29,6 +32,9 @@ struct args parse_args(int argc, const char *argv[]) { cxxopts::Options options("XRT Test Wrapper"); cxxopts::ParseResult vm; test_utils::add_default_options(options); + options.add_options()("dynamic-size", + "Runtime transfer size in bytes (dynamic TXN only)", + cxxopts::value()->default_value("0")); struct args myargs; @@ -38,10 +44,12 @@ struct args parse_args(int argc, const char *argv[]) { myargs.n_iterations = vm["iters"].as(); myargs.n_warmup_iterations = vm["warmup"].as(); myargs.trace_size = vm["trace_sz"].as(); - myargs.instr = vm["instr"].as(); + if (vm.count("instr")) + myargs.instr = vm["instr"].as(); myargs.xclbin = vm["xclbin"].as(); myargs.kernel = vm["kernel"].as(); myargs.trace_file = vm["trace_file"].as(); + myargs.dynamic_size = vm["dynamic-size"].as(); return myargs; } @@ -80,7 +88,11 @@ int setup_and_run_aie(int IN1_VOLUME, int IN2_VOLUME, int OUT_VOLUME, srand(time(NULL)); // Load instruction sequence - std::vector instr_v = test_utils::load_instr_binary(myargs.instr); + std::vector instr_v; + if (myargs.generate_instr) + instr_v = myargs.generate_instr(); + else + instr_v = 
test_utils::load_instr_binary(myargs.instr); if (myargs.verbosity >= 1) std::cout << "Sequence instr count: " << instr_v.size() << "\n"; @@ -298,7 +310,11 @@ int setup_and_run_aie(int IN1_VOLUME, int OUT_VOLUME, struct args myargs, srand(time(NULL)); // Load instruction sequence - std::vector instr_v = test_utils::load_instr_binary(myargs.instr); + std::vector instr_v; + if (myargs.generate_instr) + instr_v = myargs.generate_instr(); + else + instr_v = test_utils::load_instr_binary(myargs.instr); if (myargs.verbosity >= 1) std::cout << "Sequence instr count: " << instr_v.size() << "\n"; diff --git a/test/Conversion/AIEXToEmitC/basic_txn_cpp.mlir b/test/Conversion/AIEXToEmitC/basic_txn_cpp.mlir new file mode 100644 index 00000000000..dfbfb31cba1 --- /dev/null +++ b/test/Conversion/AIEXToEmitC/basic_txn_cpp.mlir @@ -0,0 +1,51 @@ +//===- basic_txn_cpp.mlir - Basic EmitC TXN generation test ------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Tests the end-to-end C++ TXN generation pipeline (aie-translate +// --aie-generate-txn-cpp) with a runtime_sequence containing static +// npu.write32, npu.sync, and npu.address_patch operations. +// +// Verifies the generated C++ includes the expected function structure, +// TXN encoding calls, and op_count tracking. 
+// +//===----------------------------------------------------------------------===// + +// RUN: aie-translate --aie-generate-txn-cpp %s | FileCheck %s + +// CHECK: #include "aie/Runtime/TxnEncoding.h" +// CHECK: #include +// CHECK: #include + +// CHECK-LABEL: generate_txn_sequence +// CHECK: std::vector txn; +// CHECK: aie_runtime::txn_init(txn); +// CHECK: uint32_t op_count = 0; + +// CHECK: aie_runtime::txn_append_write32(txn, +// CHECK: op_count++; + +// CHECK: aie_runtime::txn_append_sync(txn, +// CHECK: op_count++; + +// CHECK: aie_runtime::txn_append_address_patch(txn, +// CHECK: op_count++; + +// CHECK: aie_runtime::txn_prepend_header(txn, op_count, +// CHECK: return txn; + +module { + aie.device(npu2) { + aie.runtime_sequence(%buf : memref<16xi32>) { + aiex.npu.write32 {address = 196612 : ui32, value = 42 : ui32} + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.address_patch {addr = 196616 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} + } + } +} diff --git a/test/Conversion/AIEXToEmitC/dynamic_values.mlir b/test/Conversion/AIEXToEmitC/dynamic_values.mlir new file mode 100644 index 00000000000..49dd93028ed --- /dev/null +++ b/test/Conversion/AIEXToEmitC/dynamic_values.mlir @@ -0,0 +1,59 @@ +//===- dynamic_values.mlir - Dynamic SSA values in EmitC TXN -----*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Tests C++ TXN generation with dynamic SSA operands: runtime_sequence +// parameters flow through to dynamic npu.write32, npu.sync, and +// npu.address_patch operations. 
Verifies the generated C++ uses function +// parameters and (uint32_t) casts instead of only constants. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-translate --aie-generate-txn-cpp %s | FileCheck %s + +// The function should accept the i32 parameter (memref args are dropped). +// CHECK-LABEL: generate_txn_sequence +// CHECK-SAME: int32_t + +// CHECK: std::vector txn; +// CHECK: aie_runtime::txn_init(txn); +// CHECK: uint32_t op_count = 0; + +// Dynamic write32: address and value come from SSA operands via uint32_t cast. +// CHECK: (uint32_t) +// CHECK: aie_runtime::txn_append_write32(txn, +// CHECK: op_count++; + +// Dynamic sync: all parameters from SSA. +// CHECK: aie_runtime::txn_append_sync(txn, +// CHECK: op_count++; + +// Dynamic address_patch: dyn_arg_plus from SSA. +// CHECK: aie_runtime::txn_append_address_patch(txn, +// CHECK: op_count++; + +// CHECK: aie_runtime::txn_prepend_header(txn, op_count, +// CHECK: return txn; + +module { + aie.device(npu2) { + aie.runtime_sequence(%buf : memref<16xi32>, %param : i32) { + %c0_i32 = arith.constant 0 : i32 + + // Dynamic write32 with SSA address and value + aiex.npu.write32(%param, %param) {address = 0 : ui32, value = 0 : ui32} : i32, i32 + + // Dynamic sync with SSA parameters + aiex.npu.sync(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %param, %param) {channel = 0 : i32, column = 0 : i32, column_num = 0 : i32, direction = 0 : i32, row = 0 : i32, row_num = 0 : i32} : i32, i32, i32, i32, i32, i32 + + // Dynamic address_patch with SSA arg_plus + aiex.npu.address_patch(%param : i32) {addr = 100 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} + } + } +} diff --git a/test/Conversion/AIEXToEmitC/unsupported_ops.mlir b/test/Conversion/AIEXToEmitC/unsupported_ops.mlir new file mode 100644 index 00000000000..c93494987ab --- /dev/null +++ b/test/Conversion/AIEXToEmitC/unsupported_ops.mlir @@ -0,0 +1,26 @@ +//===- unsupported_ops.mlir - Unsupported op error test ----------*- 
MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Negative test: verifies that the ConvertAIEXToEmitC pass emits an error +// when encountering unsupported ops (npu.push_queue) inside a runtime_sequence. +// +//===----------------------------------------------------------------------===// + +// RUN: not aie-opt --convert-aiex-to-emitc %s 2>&1 | FileCheck %s + +// CHECK: not supported in dynamic TXN C++ generation + +module { + aie.device(npu2) { + aie.runtime_sequence() { + aiex.npu.push_queue(0, 0, S2MM : 0) {issue_token = false, repeat_count = 0 : i32, bd_id = 0 : i32} + } + } +} diff --git a/test/Conversion/DmaToNpu/dma_to_npu_dynamic_strides.mlir b/test/Conversion/DmaToNpu/dma_to_npu_dynamic_strides.mlir new file mode 100644 index 00000000000..34715602495 --- /dev/null +++ b/test/Conversion/DmaToNpu/dma_to_npu_dynamic_strides.mlir @@ -0,0 +1,56 @@ +//===- dma_to_npu_dynamic_strides.mlir - Dynamic sizes/strides ---*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// Tests that the DMA-to-NPU lowering correctly handles dynamic (SSA) sizes +// and strides in npu.dma_memcpy_nd operations. The dynamic path emits BD +// words via npu.write32 with dyn_address/dyn_value instead of blockwrite. 
+ +// RUN: aie-opt --split-input-file -aie-dma-to-npu %s | FileCheck %s + +// CHECK-LABEL: module +// Dynamic sizes produce npu.write32 ops (not blockwrite) +// CHECK: aiex.npu.write32 +// CHECK: aiex.npu.address_patch +// CHECK-SAME: arg_idx = 0 : i32 +module { + aie.device(npu2) { + aie.runtime_sequence(%arg0: memref<16384xbf16>, %arg1: i32) { + // Dynamic size in dim[0]: %arg1 is an SSA value + %c0 = arith.constant 0 : i64 + %c1 = arith.constant 1 : i64 + %c32 = arith.constant 32 : i64 + %dim0 = arith.extui %arg1 : i32 to i64 + aiex.npu.dma_memcpy_nd (%arg0[%c0, %c0, %c0, %c0][%dim0, %c1, %c32, %c32][%c0, %c32, %c32, %c1]) { metadata = @toMem, id = 1 : i64 } : memref<16384xbf16> + } + %tile_0_0 = aie.tile(0, 0) + aie.shim_dma_allocation @toMem (%tile_0_0, S2MM, 0) + } +} + +// ----- + +// Test bf16 d0_size computation: d0_size = inSize0 * elemWidth / addrGran +// For bf16: 32 * 16 / 32 = 16 (NOT 32 * (16/32) = 0) +// CHECK-LABEL: module +// CHECK: aiex.npu.write32 +module { + aie.device(npu2) { + aie.runtime_sequence(%arg0: memref<16384xbf16>, %arg1: i32) { + %c0 = arith.constant 0 : i64 + %c1 = arith.constant 1 : i64 + %c32 = arith.constant 32 : i64 + %dim = arith.extui %arg1 : i32 to i64 + // bf16 with dynamic outer dim + aiex.npu.dma_memcpy_nd (%arg0[%c0, %c0, %c0, %c0][%dim, %c1, %c32, %c32][%c0, %c32, %c32, %c1]) { metadata = @inA, id = 2 : i64 } : memref<16384xbf16> + } + %tile_0_0 = aie.tile(0, 0) + aie.shim_dma_allocation @inA (%tile_0_0, S2MM, 0) + } +} diff --git a/test/aiecc/Inputs/static_vs_dynamic_txn/compare_main.cpp b/test/aiecc/Inputs/static_vs_dynamic_txn/compare_main.cpp new file mode 100644 index 00000000000..27157bad4c3 --- /dev/null +++ b/test/aiecc/Inputs/static_vs_dynamic_txn/compare_main.cpp @@ -0,0 +1,58 @@ +//===- compare_main.cpp ----------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Harness for static-vs-dynamic TXN equivalence tests. static_txn() and +// dynamic_txn(n) live in separate translation units (because both generated +// headers define a `generate_txn_sequence` symbol at namespace scope and +// cannot be #included into the same TU). This file just compares the two +// `std::vector` streams and reports the first divergence. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +std::vector static_txn(); +std::vector dynamic_txn(int32_t n); + +int main(int argc, char **argv) { + if (argc != 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + int32_t n = std::atoi(argv[1]); + + std::vector a = static_txn(); + std::vector b = dynamic_txn(n); + + if (a == b) { + std::printf("equivalent: %zu words (n=%d)\n", a.size(), n); + return 0; + } + + std::fprintf(stderr, + "TXN streams DIFFER: static=%zu words, dynamic=%zu words " + "(n=%d)\n", + a.size(), b.size(), n); + + size_t lim = std::min(a.size(), b.size()); + for (size_t i = 0; i < lim; ++i) { + if (a[i] != b[i]) { + std::fprintf(stderr, + " first diff at word %zu: static=0x%08x dynamic=0x%08x\n", + i, a[i], b[i]); + break; + } + } + return 1; +} diff --git a/test/aiecc/Inputs/static_vs_dynamic_txn/gen_dynamic.cpp b/test/aiecc/Inputs/static_vs_dynamic_txn/gen_dynamic.cpp new file mode 100644 index 00000000000..f8d2c2fc611 --- /dev/null +++ b/test/aiecc/Inputs/static_vs_dynamic_txn/gen_dynamic.cpp @@ -0,0 +1,24 @@ +//===- gen_dynamic.cpp -----------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Wraps the dynamic `generate_txn_sequence(int32_t)` function emitted by +// aiecc into a uniquely-named symbol so it can be linked alongside one or +// more static wrappers. +// +//===----------------------------------------------------------------------===// + +#include "dynamic_txn.h" + +#include +#include + +std::vector dynamic_txn(int32_t n) { + return generate_txn_sequence(n); +} diff --git a/test/aiecc/Inputs/static_vs_dynamic_txn/gen_static.cpp b/test/aiecc/Inputs/static_vs_dynamic_txn/gen_static.cpp new file mode 100644 index 00000000000..f53346f3303 --- /dev/null +++ b/test/aiecc/Inputs/static_vs_dynamic_txn/gen_static.cpp @@ -0,0 +1,25 @@ +//===- gen_static.cpp ------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Wraps the static `generate_txn_sequence()` function emitted by aiecc into +// a uniquely-named symbol so it can be linked alongside the dynamic one. +// +// `STATIC_HEADER` is a -D quoted-include name set from the lit RUN line and +// `STATIC_NAME` is the wrapper symbol name (e.g. static_txn or +// static_txn_8192). 
+// +//===----------------------------------------------------------------------===// + +#include STATIC_HEADER + +#include +#include + +std::vector STATIC_NAME() { return generate_txn_sequence(); } diff --git a/test/aiecc/Inputs/static_vs_dynamic_txn/passthrough_dynamic.mlir b/test/aiecc/Inputs/static_vs_dynamic_txn/passthrough_dynamic.mlir new file mode 100644 index 00000000000..10eda277e42 --- /dev/null +++ b/test/aiecc/Inputs/static_vs_dynamic_txn/passthrough_dynamic.mlir @@ -0,0 +1,120 @@ +//===- passthrough_dynamic.mlir ----------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Dynamic counterpart of `cpp_static_vs_dynamic_txn.mlir`. +// +// Mirrors the static module structurally; the only difference is that the +// runtime_sequence takes an additional %n : i32 argument that is forwarded +// to the `aiex.npu.rtp_write` operations. +// +// All DMA descriptor fields (sizes, strides, offsets) are kept as +// constants on both sides so the BD-lowering pass folds them into a +// `blockwrite` TXN op on each side; otherwise, threading %n through the +// BD would force the compiler to emit per-register `write32` ops on the +// dynamic side and the two streams would differ structurally even when +// they program the same hardware state. +// +// Calling generate_txn_sequence(4096) on this header must produce the +// exact same word stream that generate_txn_sequence() in the static +// header produces. 
+// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu2) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + + aie.objectfifo @of_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @of_out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo> + + %rtp = aie.buffer(%tile_0_2) {sym_name = "rtp"} : memref<16xi32> + + %core_0_2 = aie.core(%tile_0_2) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c1_i32 = arith.constant 1 : i32 + + %subview_in = aie.objectfifo.acquire @of_in(Consume, 1) : !aie.objectfifosubview> + %elem_in = aie.objectfifo.subview.access %subview_in[0] : !aie.objectfifosubview> -> memref<64xi32> + + %subview_out = aie.objectfifo.acquire @of_out(Produce, 1) : !aie.objectfifosubview> + %elem_out = aie.objectfifo.subview.access %subview_out[0] : !aie.objectfifosubview> -> memref<64xi32> + + scf.for %i = %c0 to %c64 step %c1 { + %val = memref.load %elem_in[%i] : memref<64xi32> + %result = arith.addi %val, %c1_i32 : i32 + memref.store %result, %elem_out[%i] : memref<64xi32> + } + + aie.objectfifo.release @of_in(Consume, 1) + aie.objectfifo.release @of_out(Produce, 1) + aie.end + } {link_with = ""} + + aie.runtime_sequence(%in : memref<8192xi32>, %out : memref<8192xi32>, %n : i32) { + // %n drives the first rtp_write directly and a derived value drives + // the second. Both go through txn_append_write32, where the encoded + // word equals the unsigned cast of the runtime value — bit-identical + // to the static version when %n == the static constant. + %c1_i32 = arith.constant 1 : i32 + %n_plus_1 = arith.addi %n, %c1_i32 : i32 + + aiex.npu.rtp_write(@rtp, 0 : ui32, %n) : i32 + aiex.npu.rtp_write(@rtp, 4 : ui32, %n_plus_1) : i32 + + aiex.npu.write32 {address = 196612 : ui32, value = 42 : ui32} + + // First 4-D pattern: sizes=[2,4,8,64], strides=[2048,512,64,1] over %out. 
+ // dma_memcpy_nd offsets/sizes/strides are i64 SSA values (TODO: i32). + %c0 = arith.constant 0 : i64 + %c1 = arith.constant 1 : i64 + %c2 = arith.constant 2 : i64 + %c4 = arith.constant 4 : i64 + %c8 = arith.constant 8 : i64 + %c16 = arith.constant 16 : i64 + %c32 = arith.constant 32 : i64 + %c64 = arith.constant 64 : i64 + %c128 = arith.constant 128 : i64 + %c256 = arith.constant 256 : i64 + %c512 = arith.constant 512 : i64 + %c2048 = arith.constant 2048 : i64 + %c4096 = arith.constant 4096 : i64 + + aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0] + [%c2,%c4,%c8,%c64] + [%c2048,%c512,%c64,%c1]) + {metadata = @of_out, id = 1 : i64} : memref<8192xi32> + + aiex.npu.dma_memcpy_nd(%in[%c0,%c0,%c0,%c0] + [%c2,%c4,%c8,%c64] + [%c2048,%c512,%c64,%c1]) + {metadata = @of_in, id = 0 : i64, issue_token = true} : memref<8192xi32> + + aiex.npu.dma_wait {symbol = @of_out} + + // Second 4-D pattern with a different shape/stride mix: sizes=[1,8,16,32], + // strides=[4096,512,32,1]. This exercises a fresh BD blockwrite + the + // address_patch for a different arg index, touching more TXN words. + aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0] + [%c1,%c8,%c16,%c32] + [%c4096,%c512,%c32,%c1]) + {metadata = @of_out, id = 1 : i64} : memref<8192xi32> + + aiex.npu.dma_memcpy_nd(%in[%c0,%c0,%c0,%c0] + [%c1,%c8,%c16,%c32] + [%c4096,%c512,%c32,%c1]) + {metadata = @of_in, id = 0 : i64, issue_token = true} : memref<8192xi32> + + aiex.npu.dma_wait {symbol = @of_out} + } + } +} diff --git a/test/aiecc/cpp_dynamic_txn.mlir b/test/aiecc/cpp_dynamic_txn.mlir new file mode 100644 index 00000000000..8487baabcdc --- /dev/null +++ b/test/aiecc/cpp_dynamic_txn.mlir @@ -0,0 +1,116 @@ +//===- cpp_dynamic_txn.mlir - Dynamic C++ TXN generation --------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Tests --aie-generate-txn-cpp with a runtime_sequence containing: +// - SSA parameters (i32) +// - scf.for with iter_args +// - scf.if with results +// - Dynamic npu.dma_memcpy_nd (SSA sizes/strides) +// - Dynamic npu.rtp_write (SSA value) +// - arith ops (divui, muli, minsi, extui, trunci, cmpi, select) +// +// aiecc explicitly keeps aie.runtime_sequence legal during the module-level +// SCF→CF conversion, so runtime-sequence SCF is preserved while core body SCF +// still lowers for LLVM code generation. +// +//===----------------------------------------------------------------------===// + +// REQUIRES: peano + +// RUN: aiecc --no-xchesscc --no-xbridge --aie-generate-txn-cpp \ +// RUN: --txn-cpp-name=%t.h --no-compile --no-link --verbose %s 2>&1 | FileCheck %s + +// CHECK: Compilation completed successfully + +// Also test unified compilation (XCLBIN + TXN from same MLIR): +// RUN: aiecc --no-xchesscc --no-xbridge --peano %PEANO_INSTALL_DIR \ +// RUN: --aie-generate-xclbin --xclbin-name=%t.xclbin \ +// RUN: --aie-generate-txn-cpp --txn-cpp-name=%t_unified.h \ +// RUN: --verbose %s 2>&1 | FileCheck %s --check-prefix=UNIFIED + +// UNIFIED: Compilation completed successfully + +module { + aie.device(npu2) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + + aie.objectfifo @of_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @of_out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo> + + %rtp = aie.buffer(%tile_0_2) {sym_name = "rtp"} : memref<16xi32> + + %core_0_2 = aie.core(%tile_0_2) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + + %subview_in = aie.objectfifo.acquire @of_in(Consume, 1) : !aie.objectfifosubview> + %elem_in = aie.objectfifo.subview.access 
%subview_in[0] : !aie.objectfifosubview> -> memref<16xi32> + + %subview_out = aie.objectfifo.acquire @of_out(Produce, 1) : !aie.objectfifosubview> + %elem_out = aie.objectfifo.subview.access %subview_out[0] : !aie.objectfifosubview> -> memref<16xi32> + + scf.for %i = %c0 to %c16 step %c1 { + %val = memref.load %elem_in[%i] : memref<16xi32> + %c1_i32 = arith.constant 1 : i32 + %result = arith.addi %val, %c1_i32 : i32 + memref.store %result, %elem_out[%i] : memref<16xi32> + } + + aie.objectfifo.release @of_in(Consume, 1) + aie.objectfifo.release @of_out(Produce, 1) + aie.end + } {link_with = ""} + + // Runtime sequence with SSA parameters, SCF loops, and dynamic DMA + aie.runtime_sequence(%in : memref<16xi32>, %out : memref<16xi32>, %n : i32) { + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c16_i32 = arith.constant 16 : i32 + + // Dynamic RTP write + aiex.npu.rtp_write(@rtp, 0 : ui32, %n) : i32 + + // Derived value + %n_div_16 = arith.divui %n, %c16_i32 : i32 + + // scf.for with iter_args + %c0_idx = arith.index_cast %c0_i32 : i32 to index + %c1_idx = arith.index_cast %c1_i32 : i32 to index + %n_idx = arith.index_cast %n_div_16 : i32 to index + + %result = scf.for %i = %c0_idx to %n_idx step %c1_idx + iter_args(%acc = %c0_i32) -> (i32) { + %i_i32 = arith.index_cast %i : index to i32 + + // scf.if with results + %cmp = arith.cmpi sgt, %i_i32, %c0_i32 : i32 + %val = scf.if %cmp -> (i32) { + scf.yield %c1_i32 : i32 + } else { + scf.yield %c0_i32 : i32 + } + + // Dynamic DMA with SSA sizes + %c0 = arith.constant 0 : i64 + %c1 = arith.constant 1 : i64 + %c16 = arith.constant 16 : i64 + %dim = arith.extui %n_div_16 : i32 to i64 + aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0][%dim,%c1,%c1,%c16][%c0,%c0,%c0,%c1]) {metadata = @of_out, id = 1 : i64} : memref<16xi32> + + %new_acc = arith.addi %acc, %val : i32 + scf.yield %new_acc : i32 + } + + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, 
row_num = 1 : i32} + } + } +} diff --git a/test/aiecc/cpp_static_vs_dynamic_txn.mlir b/test/aiecc/cpp_static_vs_dynamic_txn.mlir new file mode 100644 index 00000000000..9ea9fcb5df9 --- /dev/null +++ b/test/aiecc/cpp_static_vs_dynamic_txn.mlir @@ -0,0 +1,163 @@ +//===- cpp_static_vs_dynamic_txn.mlir -----------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// End-to-end equivalence check: the TXN word stream produced from a static +// `aie.runtime_sequence` (this file, hardcoded N=4096) must be bit-identical +// to the stream produced from a runtime-parameterized version +// (Inputs/static_vs_dynamic_txn/passthrough_dynamic.mlir) when the dynamic +// version is invoked with N=4096. +// +// All DMA descriptor fields are kept as constants on both sides so the BD +// pass folds them into `blockwrite` ops; only `aiex.npu.rtp_write` +// operations are wired to the runtime %n parameter on the dynamic side. +// This is the most a runtime parameter can affect on the dynamic side +// without breaking blockwrite folding of the BDs and producing a +// structurally different (per-register `write32`) TXN stream. +// +// Two 4-D DMA patterns and two `rtp_write`s (one direct, one derived) are +// included to exercise more TXN op encodings and word territory than a +// single flat transfer would. +// +// Why a host-compiled comparison rather than a pure FileCheck? aie-translate +// has no flag to substitute SSA i32 arguments of a runtime_sequence at +// translate time, so a `--aie-npu-to-binary` round-trip on the dynamic MLIR +// cannot resolve %n to a constant. The comparison therefore happens in +// compiled C++. 
Each generated header defines a `generate_txn_sequence` +// symbol, so the two headers are compiled in separate translation units +// (`gen_static.cpp`, `gen_dynamic.cpp`) and exposed under unique wrapper +// names that the harness `compare_main.cpp` calls. +// +// To add a new size N to this test: +// 1. Drop a new static MLIR (e.g. passthrough_static_NEW.mlir) into +// Inputs/static_vs_dynamic_txn/, identical to this file but with the +// `arith.constant N : i32` value updated. +// 2. Add three RUN lines: one to generate the static header, one to +// compile gen_static.cpp with -DSTATIC_HEADER and -DSTATIC_NAME, one +// to link & invoke compare_main.cpp with -Dstatic_txn=... passing N. +// +//===----------------------------------------------------------------------===// + +// REQUIRES: peano + +// RUN: rm -rf %t.d && mkdir -p %t.d + +// Generate the dynamic TXN header. +// RUN: aiecc --no-xchesscc --no-xbridge --aie-generate-txn-cpp \ +// RUN: --txn-cpp-name=%t.d/dynamic_txn.h --no-compile --no-link \ +// RUN: %S/Inputs/static_vs_dynamic_txn/passthrough_dynamic.mlir + +// Generate the static TXN header for N=4096 (this file). +// RUN: aiecc --no-xchesscc --no-xbridge --aie-generate-txn-cpp \ +// RUN: --txn-cpp-name=%t.d/static_txn_4096.h --no-compile --no-link %s + +// Compile harness wrappers. gen_static.cpp uses -D macros so the same +// source file can be reused for additional N values. +// RUN: clang++ -std=c++17 -O0 -I%t.d -I%S/../../include \ +// RUN: -DSTATIC_HEADER='"static_txn_4096.h"' -DSTATIC_NAME=static_txn_4096 \ +// RUN: -c %S/Inputs/static_vs_dynamic_txn/gen_static.cpp \ +// RUN: -o %t.d/gen_static_4096.o +// RUN: clang++ -std=c++17 -O0 -I%t.d -I%S/../../include \ +// RUN: -c %S/Inputs/static_vs_dynamic_txn/gen_dynamic.cpp \ +// RUN: -o %t.d/gen_dynamic.o + +// compare_main.cpp's prototypes use the canonical name `static_txn`; the +// -D rename selects which generated wrapper resolves it for this run. 
+ +// --- N=4096 comparison --- +// RUN: clang++ -std=c++17 -O0 -Dstatic_txn=static_txn_4096 \ +// RUN: %S/Inputs/static_vs_dynamic_txn/compare_main.cpp \ +// RUN: %t.d/gen_static_4096.o %t.d/gen_dynamic.o \ +// RUN: -o %t.compare_4096.exe +// RUN: %t.compare_4096.exe 4096 + +module { + aie.device(npu2) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + + aie.objectfifo @of_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @of_out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo> + + %rtp = aie.buffer(%tile_0_2) {sym_name = "rtp"} : memref<16xi32> + + %core_0_2 = aie.core(%tile_0_2) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c1_i32 = arith.constant 1 : i32 + + %subview_in = aie.objectfifo.acquire @of_in(Consume, 1) : !aie.objectfifosubview> + %elem_in = aie.objectfifo.subview.access %subview_in[0] : !aie.objectfifosubview> -> memref<64xi32> + + %subview_out = aie.objectfifo.acquire @of_out(Produce, 1) : !aie.objectfifosubview> + %elem_out = aie.objectfifo.subview.access %subview_out[0] : !aie.objectfifosubview> -> memref<64xi32> + + scf.for %i = %c0 to %c64 step %c1 { + %val = memref.load %elem_in[%i] : memref<64xi32> + %result = arith.addi %val, %c1_i32 : i32 + memref.store %result, %elem_out[%i] : memref<64xi32> + } + + aie.objectfifo.release @of_in(Consume, 1) + aie.objectfifo.release @of_out(Produce, 1) + aie.end + } {link_with = ""} + + aie.runtime_sequence(%in : memref<8192xi32>, %out : memref<8192xi32>) { + // Static N=4096; second rtp_write value is the derived %n+1 = 4097. 
+ %c4096_i32 = arith.constant 4096 : i32 + %c4097_i32 = arith.constant 4097 : i32 + aiex.npu.rtp_write(@rtp, 0 : ui32, %c4096_i32) : i32 + aiex.npu.rtp_write(@rtp, 4 : ui32, %c4097_i32) : i32 + + aiex.npu.write32 {address = 196612 : ui32, value = 42 : ui32} + + %c0 = arith.constant 0 : i64 + %c1 = arith.constant 1 : i64 + %c2 = arith.constant 2 : i64 + %c4 = arith.constant 4 : i64 + %c8 = arith.constant 8 : i64 + %c16 = arith.constant 16 : i64 + %c32 = arith.constant 32 : i64 + %c64 = arith.constant 64 : i64 + %c128 = arith.constant 128 : i64 + %c256 = arith.constant 256 : i64 + %c512 = arith.constant 512 : i64 + %c2048 = arith.constant 2048 : i64 + %c4096 = arith.constant 4096 : i64 + + // First 4-D pattern: sizes=[2,4,8,64], strides=[2048,512,64,1]. + aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0] + [%c2,%c4,%c8,%c64] + [%c2048,%c512,%c64,%c1]) + {metadata = @of_out, id = 1 : i64} : memref<8192xi32> + + aiex.npu.dma_memcpy_nd(%in[%c0,%c0,%c0,%c0] + [%c2,%c4,%c8,%c64] + [%c2048,%c512,%c64,%c1]) + {metadata = @of_in, id = 0 : i64, issue_token = true} : memref<8192xi32> + + aiex.npu.dma_wait {symbol = @of_out} + + // Second 4-D pattern with different sizes/strides. + aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0] + [%c1,%c8,%c16,%c32] + [%c4096,%c512,%c32,%c1]) + {metadata = @of_out, id = 1 : i64} : memref<8192xi32> + + aiex.npu.dma_memcpy_nd(%in[%c0,%c0,%c0,%c0] + [%c1,%c8,%c16,%c32] + [%c4096,%c512,%c32,%c1]) + {metadata = @of_in, id = 0 : i64, issue_token = true} : memref<8192xi32> + + aiex.npu.dma_wait {symbol = @of_out} + } + } +} diff --git a/test/dialect/AIE/runtime_sequence_isolated.mlir b/test/dialect/AIE/runtime_sequence_isolated.mlir new file mode 100644 index 00000000000..26d1463427d --- /dev/null +++ b/test/dialect/AIE/runtime_sequence_isolated.mlir @@ -0,0 +1,63 @@ +//===- runtime_sequence_isolated.mlir - SCF in runtime_sequence --*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Tests that aie.runtime_sequence supports SCF ops (scf.for with iter_args, +// scf.if with results) which are needed for dynamic TXN generation. +// SCF ops are preserved through the compilation pipeline via +// markOpRecursivelyLegal in aiecc's SCF→CF conversion. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt %s | FileCheck %s + +// CHECK: aie.runtime_sequence +// CHECK: arith.divui +// CHECK: scf.for +// CHECK: scf.if +// CHECK: aiex.npu.sync +module { + aie.device(npu2) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + + aie.objectfifo @of_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @of_out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo> + + %rtp = aie.buffer(%tile_0_2) {sym_name = "rtp"} : memref<16xi32> + + // Runtime sequence with SSA params and SCF loops. 
+ aie.runtime_sequence(%in : memref<16xi32>, %out : memref<16xi32>, %n : i32) { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : i32 + %c16 = arith.constant 16 : i32 + + %n_div_16 = arith.divui %n, %c16 : i32 + + %c0_idx = arith.index_cast %c0 : i32 to index + %c1_idx = arith.index_cast %c1 : i32 to index + %n_idx = arith.index_cast %n_div_16 : i32 to index + + %result = scf.for %i = %c0_idx to %n_idx step %c1_idx + iter_args(%acc = %c0) -> (i32) { + %i32 = arith.index_cast %i : index to i32 + %cmp = arith.cmpi sgt, %i32, %c0 : i32 + %val = scf.if %cmp -> (i32) { + scf.yield %c1 : i32 + } else { + scf.yield %c0 : i32 + } + %new = arith.addi %acc, %val : i32 + scf.yield %new : i32 + } + + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + } + } +} diff --git a/test/npu-xrt/sync_task_complete_token/aie2.py b/test/npu-xrt/sync_task_complete_token/aie2.py index 8ef7f71e0f8..9ea58056c16 100644 --- a/test/npu-xrt/sync_task_complete_token/aie2.py +++ b/test/npu-xrt/sync_task_complete_token/aie2.py @@ -18,7 +18,6 @@ from aie.dialects.aiex import * from aie.iron.controlflow import range_ - dtype = T.i32 output_sz = 16 diff --git a/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py b/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py index 81c7228e2da..672f42b2c51 100644 --- a/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py +++ b/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py @@ -18,7 +18,6 @@ from aie.dialects.aiex import * from aie.iron.controlflow import range_ - dtype = T.i32 output_sz = 16 diff --git a/test/python/buffer_resolution.py b/test/python/buffer_resolution.py index 684d69e6c30..9b95774fb51 100644 --- a/test/python/buffer_resolution.py +++ b/test/python/buffer_resolution.py @@ -30,8 +30,8 @@ # Test 1: Buffer given to a Worker is resolved before inline_ops fires, # so element writes inside the callback produce correct rtp_write ops. 
# CHECK-LABEL: TEST: rtp_buffer_written_in_inline_ops -# CHECK: aiex.npu.rtp_write(@my_rtp, 0, 7) -# CHECK: aiex.npu.rtp_write(@my_rtp, 1, 3) +# CHECK: aiex.npu.rtp_write(@my_rtp, 0 : ui32, 7 : i32) +# CHECK: aiex.npu.rtp_write(@my_rtp, 1 : ui32, 3 : i32) # --------------------------------------------------------------------------- print("\nTEST: rtp_buffer_written_in_inline_ops") @@ -70,9 +70,9 @@ def set_rtp(buf): # Test 2: Multiple RTP buffers (one per worker) in a list, all written in one # inline_ops callback — mirrors the resnet layers_conv2_x pattern. # CHECK-LABEL: TEST: multiple_rtp_buffers_in_inline_ops -# CHECK: aiex.npu.rtp_write(@rtp_w0, 0, 1) -# CHECK: aiex.npu.rtp_write(@rtp_w1, 0, 2) -# CHECK: aiex.npu.rtp_write(@rtp_w2, 0, 3) +# CHECK: aiex.npu.rtp_write(@rtp_w0, 0 : ui32, 1 : i32) +# CHECK: aiex.npu.rtp_write(@rtp_w1, 0 : ui32, 2 : i32) +# CHECK: aiex.npu.rtp_write(@rtp_w2, 0 : ui32, 3 : i32) # --------------------------------------------------------------------------- print("\nTEST: multiple_rtp_buffers_in_inline_ops") diff --git a/tools/aiecc/aiecc.cpp b/tools/aiecc/aiecc.cpp index 82aea96f3e3..a2c85dec11c 100644 --- a/tools/aiecc/aiecc.cpp +++ b/tools/aiecc/aiecc.cpp @@ -61,6 +61,7 @@ #include "aie/Dialect/AIEX/IR/AIEXDialect.h" #include "aie/Dialect/AIEX/Transforms/AIEXPasses.h" #include "aie/InitialAllDialect.h" +#include "aie/Targets/AIENpuLowering.h" #include "aie/Targets/AIETargets.h" #include "aie/version.h" @@ -74,6 +75,7 @@ #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/Passes.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Transforms/Passes.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" @@ -85,6 +87,7 @@ #include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" #include 
"mlir/Target/LLVMIR/Export.h" +#include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" #include "aie/Target/LLVMIR/Dialect/XLLVM/XLLVMToLLVMIRTranslation.h" @@ -226,6 +229,16 @@ static cl::opt cl::desc("Output instructions filename for NPU target"), cl::init("{0}_{1}.bin"), cl::cat(aieCompilerOptions)); +static cl::opt + generateCppTxn("aie-generate-txn-cpp", + cl::desc("Generate C++ code for runtime TXN generation"), + cl::init(false), cl::cat(aieCompilerOptions)); + +static cl::opt cppTxnName("txn-cpp-name", + cl::desc("Output C++ TXN filename"), + cl::init("generated_txn.h"), + cl::cat(aieCompilerOptions)); + static cl::opt generateElf( "aie-generate-elf", cl::desc("Generate ELF for AIE control/configuration (via aiebu)"), @@ -627,13 +640,19 @@ static bool executeCommand(ArrayRef command, static std::string formatString(StringRef formatStr, StringRef deviceName, StringRef seqName = "") { std::string result = formatStr.str(); - size_t pos = result.find("{0}"); - if (pos != std::string::npos) { - result.replace(pos, 3, deviceName.str()); + std::string devStr = deviceName.str(); + size_t pos = 0; + while ((pos = result.find("{0}", pos)) != std::string::npos) { + result.replace(pos, 3, devStr); + pos += devStr.size(); } - pos = result.find("{1}"); - if (pos != std::string::npos && !seqName.empty()) { - result.replace(pos, 3, seqName.str()); + if (!seqName.empty()) { + std::string seqStr = seqName.str(); + pos = 0; + while ((pos = result.find("{1}", pos)) != std::string::npos) { + result.replace(pos, 3, seqStr); + pos += seqStr.size(); + } } return result; } @@ -761,7 +780,7 @@ static std::string discoverPeanoInstallDir() { } // Cached Peano install directory -static std::optional cachedPeanoDir; +static std::string cachedPeanoDir; static std::once_flag peanoDirFlag; // Discover aietools installation directory by finding xchesscc in PATH @@ -829,13 +848,13 @@ static StringRef getAietoolsDir() { static StringRef getPeanoInstallDir() { 
std::call_once(peanoDirFlag, [] { cachedPeanoDir = discoverPeanoInstallDir(); - if (verbose && !cachedPeanoDir->empty()) { + if (verbose && !cachedPeanoDir.empty()) { std::lock_guard lock(outputMutex); - llvm::outs() << "Discovered Peano installation: " << *cachedPeanoDir + llvm::outs() << "Discovered Peano installation: " << cachedPeanoDir << "\n"; } }); - return *cachedPeanoDir; + return cachedPeanoDir; } // Downgrade LLVM IR for compatibility with Chess toolchain's older LLVM. @@ -1143,12 +1162,15 @@ static CoreInfo getCoreInfo(xilinx::AIE::CoreOp coreOp) { // Prefer canonical link_files ArrayAttr (populated by AIEAssignCoreLinkFiles, // which runs as part of the resource-allocation pipeline). if (auto filesAttr = coreOp.getLinkFiles()) { - for (auto f : filesAttr->getAsRange()) - info.linkFiles.push_back(f.getValue().str()); + for (auto f : filesAttr->getAsRange()) { + if (!f.getValue().empty()) + info.linkFiles.push_back(f.getValue().str()); + } } else if (auto linkWithAttr = coreOp.getLinkWithAttr()) { // Fallback: deprecated core-level link_with was not migrated by the pass // (e.g., pipeline was not run). Treat it as a single-element list. - info.linkFiles.push_back(linkWithAttr.getValue().str()); + if (!linkWithAttr.getValue().empty()) + info.linkFiles.push_back(linkWithAttr.getValue().str()); } if (auto elfAttr = coreOp.getElfFileAttr()) { @@ -1435,19 +1457,10 @@ static LogicalResult runResourceAllocationPipeline(ModuleOp moduleOp, // options (canonicalize, lower, optimize) through parseFromString, not // through direct member assignment. Without this, aie2p falls back to aie1 // patterns. - std::string lowerTarget = aieTarget.lower(); - if (lowerTarget == "aie2" || lowerTarget == "aieml" || - lowerTarget == "aie2p") { - std::string vecPipeline = - "convert-vector-to-aievec{aie-target=" + lowerTarget + - " target-backend=llvmir" + - (bf16Emulation ? 
" bf16-emulation=true" : "") + "}"; - if (failed(parsePassPipeline(vecPipeline, pm))) { - llvm::errs() << "Error: Failed to parse convert-vector-to-aievec " - "pipeline\n"; - return failure(); - } - } + // Note: convert-vector-to-aievec is now in the per-core LLVM lowering + // pipeline, not here. Running it at the module level would vectorize + // arith ops inside runtime_sequence (e.g. arith.minsi -> aievec.min), + // breaking EmitC conversion for the C++ TXN code path. // Step 2: Lower affine pm.addPass(createLowerAffinePass()); @@ -1497,9 +1510,6 @@ static LogicalResult runResourceAllocationPipeline(ModuleOp moduleOp, devicePm.addPass(xilinx::AIE::createAIEVectorTransferLoweringPass()); - // Step 5: Convert SCF to CF (module-level pass) - pm.addPass(createSCFToControlFlowPass()); - if (verbose) { llvm::outs() << "Running resource allocation pipeline in-memory " << "(alloc-scheme=" << allocScheme.getValue() << ")\n"; @@ -1512,6 +1522,36 @@ static LogicalResult runResourceAllocationPipeline(ModuleOp moduleOp, return failure(); } + // Step 5: Convert SCF to CF in aie.core bodies only. + // Walk each CoreOp and apply the conversion within its region. This avoids + // touching aie.runtime_sequence (which preserves SCF for EmitC codegen) + // without needing fragile ConversionTarget exclusion lists. 
+ { + MLIRContext *ctx = moduleOp.getContext(); + LogicalResult coreConvResult = success(); + moduleOp.walk([&](xilinx::AIE::CoreOp coreOp) { + if (failed(coreConvResult)) + return; + ConversionTarget scfTarget(*ctx); + scfTarget.addLegalDialect(); + scfTarget.addLegalDialect(); + scfTarget.addLegalDialect(); + scfTarget.addLegalDialect(); + scfTarget.addLegalDialect(); + scfTarget.addLegalDialect(); + scfTarget.addIllegalDialect(); + RewritePatternSet scfPatterns(ctx); + populateSCFToControlFlowConversionPatterns(scfPatterns); + if (failed(applyPartialConversion(coreOp, scfTarget, + std::move(scfPatterns)))) { + coreOp.emitError("SCF to CF conversion failed"); + coreConvResult = failure(); + } + }); + if (failed(coreConvResult)) + return failure(); + } + if (verbose) { llvm::outs() << "Resource allocation pipeline completed successfully\n"; } @@ -1571,21 +1611,8 @@ static LogicalResult runNpuLoweringPipeline(ModuleOp moduleOp, pm.enableVerifier(true); } - // Add materialize runtime sequences pass at module level (before device - // nesting) unless --no-materialize is specified - if (!noMaterialize) { - pm.addPass(xilinx::AIEX::createAIEMaterializeRuntimeSequencesPass()); - } - - // Device-level passes - OpPassManager &devicePm = pm.nest(); - devicePm.addPass(xilinx::AIEX::createAIEMaterializeBDChainsPass()); - devicePm.addPass(xilinx::AIEX::createAIESubstituteShimDMAAllocationsPass()); - devicePm.addPass(xilinx::AIEX::createAIEAssignRuntimeSequenceBDIDsPass()); - devicePm.addPass(createCanonicalizerPass()); - devicePm.addPass(xilinx::AIEX::createAIEDMATasksToNPUPass()); - devicePm.addPass(xilinx::AIEX::createAIEDmaToNpuPass()); - devicePm.addPass(xilinx::AIEX::createAIELowerSetLockPass()); + xilinx::AIE::populateNpuLoweringPipeline(pm, + /*skipMaterialize=*/noMaterialize); if (verbose) { llvm::outs() << "Running NPU lowering pipeline in-memory\n"; @@ -1771,6 +1798,19 @@ static LogicalResult runLLVMLoweringPipeline(ModuleOp moduleOp, 
devicePm.addPass(xilinx::AIE::createAIENormalizeAddressSpacesPass()); devicePm.addPass(xilinx::AIEX::createAIETransformBfpTypesPass()); + // Step 1b: Vector to AIEVec conversion (before core extraction) + { + std::string lt = aieTarget.lower(); + if (lt == "aie2" || lt == "aieml" || lt == "aie2p") { + std::string vecPipeline = "convert-vector-to-aievec{aie-target=" + lt + + " target-backend=llvmir}"; + if (failed(parsePassPipeline(vecPipeline, pm))) { + llvm::errs() << "Error: Failed to parse convert-vector-to-aievec\n"; + return failure(); + } + } + } + // Step 2: aie-standard-lowering with specific core coordinates // This extracts the specified core and removes the aie.device wrapper xilinx::AIE::AIECoreToStandardOptions coreOpts; @@ -1788,6 +1828,7 @@ static LogicalResult runLLVMLoweringPipeline(ModuleOp moduleOp, pm.addPass(xilinx::aievec::createConvertAIEVecToLLVMPass(aievecOpts)); // Step 5: Standard LLVM lowering passes + pm.addPass(createSCFToControlFlowPass()); pm.addPass(createCanonicalizerPass()); pm.addPass(createCSEPass()); pm.addPass(memref::createExpandStridedMetadataPass()); @@ -1869,6 +1910,7 @@ static LogicalResult runUnifiedLLVMLoweringPipeline(ModuleOp moduleOp, pm.addPass(xilinx::aievec::createConvertAIEVecToLLVMPass(aievecOpts)); // Step 5: Standard LLVM lowering passes + pm.addPass(createSCFToControlFlowPass()); pm.addPass(createCanonicalizerPass()); pm.addPass(createCSEPass()); pm.addPass(memref::createExpandStridedMetadataPass()); @@ -2049,6 +2091,13 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp, // The pipeline is destructive (removes other cores), so we clone first. OwningOpRef coreModule = moduleOp.clone(); + // Remove runtime_sequences from the clone — they're not needed for + // core compilation and their presence causes the canonicalizer (in + // convert-vector-to-aievec) to hoist constants from the sequence to + // device scope, creating cross-region references that crash during + // core extraction. 
+ coreModule->walk([](xilinx::AIE::RuntimeSequenceOp seqOp) { seqOp.erase(); }); + // Register LLVM IR translation dialects mlir::registerBuiltinDialectTranslation(context); mlir::registerLLVMDialectTranslation(context); @@ -2841,6 +2890,10 @@ compileCoresUnified(MLIRContext &context, ModuleOp moduleOp, // Step 1: Clone module and run unified LLVM lowering pipeline OwningOpRef unifiedModule = moduleOp.clone(); + // Remove runtime_sequences from the clone (not needed for core compilation). + unifiedModule->walk( + [](xilinx::AIE::RuntimeSequenceOp seqOp) { seqOp.erase(); }); + // Register LLVM IR translation dialects mlir::registerBuiltinDialectTranslation(context); mlir::registerLLVMDialectTranslation(context); @@ -3082,6 +3135,8 @@ compileCoresUnified(MLIRContext &context, ModuleOp moduleOp, // Search order: current working directory, tmpDirName, input file // directory for (const auto &linkWithFile : linkWithFiles) { + if (linkWithFile.empty()) + continue; SmallString<256> srcPath; if (sys::path::is_absolute(linkWithFile)) { srcPath = linkWithFile; @@ -3198,6 +3253,8 @@ compileCoresUnified(MLIRContext &context, ModuleOp moduleOp, // Handle external object files specified via link_files (or deprecated // link_with). Search order: absolute, cwd, tmpDirName, input file dir. for (const auto &lf : core.linkFiles) { + if (lf.empty()) + continue; SmallString<256> srcLinkWith; if (sys::path::is_absolute(lf)) { srcLinkWith = lf; @@ -3826,6 +3883,51 @@ static LogicalResult generateNpuInstructions(ModuleOp moduleOp, return success(); } +//===----------------------------------------------------------------------===// +// C++ TXN Generation +//===----------------------------------------------------------------------===// + +/// Generate C++ code that builds TXN instruction binaries at runtime. +/// Delegates to AIETranslateToCppTxn which clones the module internally, +/// runs NPU lowering + EmitC conversion, and emits C++. 
+static LogicalResult generateCppTxnCode(ModuleOp moduleOp, StringRef tmpDirName, + StringRef devName) { + if (!generateCppTxn) + return success(); + + if (verbose) + llvm::outs() << "Generating C++ TXN code for device: " << devName << "\n"; + + if (dryRun) { + if (verbose) + llvm::outs() << "Would generate C++ TXN code for device: " << devName + << "\n"; + return success(); + } + + // AIETranslateToCppTxn clones the module internally, so no need to clone + // here. + std::string outputFileName = formatString(cppTxnName, devName); + + std::error_code ec; + raw_fd_ostream outFile(outputFileName, ec, sys::fs::OpenFlags::OF_None); + if (ec) { + llvm::errs() << "Error opening C++ TXN output file: " << ec.message() + << "\n"; + return failure(); + } + + if (failed(xilinx::AIE::AIETranslateToCppTxn(moduleOp, outFile))) { + llvm::errs() << "Error generating C++ TXN code\n"; + return failure(); + } + + if (verbose) + llvm::outs() << "Wrote C++ TXN code to: " << outputFileName << "\n"; + + return success(); +} + //===----------------------------------------------------------------------===// // Transaction Generation //===----------------------------------------------------------------------===// @@ -5316,6 +5418,13 @@ static LogicalResult compileAIEModule(MLIRContext &context, ModuleOp moduleOp, return failure(); } + // Generate C++ TXN code if requested. This clones the module internally, + // so moduleOp must still be in its pre-NPU-lowered state (which it is, + // because generateNpuInstructions also works on a clone). + if (failed(generateCppTxnCode(moduleOp, tmpDirName, devName))) { + return failure(); + } + // Generate transaction MLIR output if requested. if (failed(generateTransactionOutput(moduleOp, tmpDirName, devName))) { return failure();