diff --git a/backends/cadence/fused_quant/op_bmm.cpp b/backends/cadence/fused_quant/op_bmm.cpp
new file mode 100644
index 00000000000..2c79bcb6a59
--- /dev/null
+++ b/backends/cadence/fused_quant/op_bmm.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cadence/fused_quant/op_bmm.h>
+#include <executorch/backends/cadence/fused_quant/quant_utils.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::KernelRuntimeContext;
+
+namespace {
+
+// Reference batched matmul over dense row-major floats: for every batch b,
+// out[b] (M x N) = inp[b] (M x K) * other[b] (K x N), summing k in order.
+void bmm_kernel(
+    const float* inp,
+    const float* other,
+    float* out,
+    int64_t batch,
+    int64_t M,
+    int64_t K,
+    int64_t N) {
+  for (int64_t b = 0; b < batch; ++b) {
+    for (int64_t mn = 0; mn < M * N; ++mn) {
+      float acc = 0.0f;
+      for (int64_t k = 0; k < K; ++k) {
+        acc += inp[(b * M + mn / N) * K + k] * other[(b * K + k) * N + mn % N];
+      }
+      out[b * M * N + mn] = acc;
+    }
+  }
+}
+
+} // namespace
+
+// Fused dequantize -> float bmm -> quantize. An operand whose *_scale is
+// empty is accessed as float. Bad shapes fail ctx and return out unwritten.
+Tensor& bmm_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& inp,
+    const Tensor& other,
+    const optional<Tensor>& inp_scale,
+    const optional<Tensor>& inp_zero_point,
+    ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    optional<int64_t> inp_axis,
+    const optional<Tensor>& other_scale,
+    const optional<Tensor>& other_zero_point,
+    ScalarType other_dtype,
+    int64_t other_quant_min,
+    int64_t other_quant_max,
+    optional<int64_t> other_axis,
+    const optional<Tensor>& out_scale,
+    const optional<Tensor>& out_zero_point,
+    ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    optional<int64_t> out_axis,
+    Tensor& out) {
+  ET_KERNEL_CHECK(
+      ctx, inp.dim() == 3 && other.dim() == 3, InvalidArgument, out);
+  const int64_t batch = inp.size(0);
+  const int64_t M = inp.size(1);
+  const int64_t K = inp.size(2);
+  const int64_t N = other.size(2);
+  const int64_t inp_numel = inp.numel();
+  const int64_t other_numel = other.numel();
+  const int64_t out_numel = batch * M * N;
+  ET_KERNEL_CHECK(
+      ctx, other.size(0) == batch && other.size(1) == K, InvalidArgument, out);
+  ET_KERNEL_CHECK(ctx, out.numel() >= out_numel, InvalidArgument, out);
+
+  // Dequantize each quantized input into scratch; float inputs are used as is.
+  std::vector<float> inp_buf;
+  const float* const inp_float = [&]() -> const float* {
+    if (!inp_scale.has_value()) {
+      return inp.const_data_ptr<float>();
+    }
+    inp_buf.resize(inp_numel);
+    QParams qp = extract_qparams(
+        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp_axis, inp);
+    FUSED_QUANT_DTYPE_SWITCH(
+        inp.scalar_type(),
+        scalar_t,
+        dequantize_buffer(
+            inp.const_data_ptr<scalar_t>(), inp_buf.data(), inp_numel, qp);)
+    return inp_buf.data();
+  }();
+  std::vector<float> other_buf;
+  const float* const other_float = [&]() -> const float* {
+    if (!other_scale.has_value()) {
+      return other.const_data_ptr<float>();
+    }
+    other_buf.resize(other_numel);
+    QParams qp = extract_qparams(
+        other_scale,
+        other_zero_point,
+        other_quant_min,
+        other_quant_max,
+        other_axis,
+        other);
+    FUSED_QUANT_DTYPE_SWITCH(other.scalar_type(),
+                             scalar_t,
+                             dequantize_buffer(
+                                 other.const_data_ptr<scalar_t>(),
+                                 other_buf.data(),
+                                 other_numel,
+                                 qp);)
+    return other_buf.data();
+  }();
+
+  // Float output: the kernel writes straight into out.
+  if (!out_scale.has_value()) {
+    bmm_kernel(
+        inp_float, other_float, out.mutable_data_ptr<float>(), batch, M, K, N);
+    return out;
+  }
+  // Quantized output: accumulate in float scratch, then quantize into out.
+  std::vector<float> result_float(out_numel);
+  bmm_kernel(inp_float, other_float, result_float.data(), batch, M, K, N);
+  QParams qp = extract_qparams(
+      out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+  FUSED_QUANT_DTYPE_SWITCH(out.scalar_type(),
+                           scalar_t,
+                           quantize_buffer(
+                               result_float.data(),
+                               out.mutable_data_ptr<scalar_t>(),
+                               out_numel,
+                               qp);)
+  return out;
+}
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/op_bmm.h b/backends/cadence/fused_quant/op_bmm.h
new file mode 100644
index 00000000000..f814b46b481
--- /dev/null
+++ b/backends/cadence/fused_quant/op_bmm.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+// Batched matmul with fused (de)quantization; writes to and returns `out`.
+executorch::aten::Tensor& bmm_out(
+    executorch::runtime::KernelRuntimeContext& ctx,
+    const executorch::aten::Tensor& inp,
+    const executorch::aten::Tensor& other,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    executorch::aten::ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    executorch::aten::optional<int64_t> inp_axis,
+    const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>&
+        other_zero_point,
+    executorch::aten::ScalarType other_dtype,
+    int64_t other_quant_min,
+    int64_t other_quant_max,
+    executorch::aten::optional<int64_t> other_axis,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    executorch::aten::ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    executorch::aten::optional<int64_t> out_axis,
+    executorch::aten::Tensor& out);
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/targets.bzl b/backends/cadence/fused_quant/targets.bzl
index 902d4d2727f..f98a357ae90 100644
--- a/backends/cadence/fused_quant/targets.bzl
+++ b/backends/cadence/fused_quant/targets.bzl
@@ -58,3 +58,15 @@ def define_common_targets():
         ],
         visibility = ["PUBLIC"],
     )
+
+    runtime.cxx_library(
+        name = "op_bmm",  # bmm_out: fused dequantize -> bmm -> quantize
+        srcs = ["op_bmm.cpp"],
+        exported_headers = ["op_bmm.h"],
+        platforms = CXX,
+        deps = [
+            ":quant_utils",  # op_bmm.cpp includes fused_quant/quant_utils.h
+            "//executorch/runtime/kernel:kernel_includes",
+        ],
+        visibility = ["PUBLIC"],
+    )
diff --git a/backends/cadence/fused_quant/tests/BUCK b/backends/cadence/fused_quant/tests/BUCK
index c503049dc69..90b3af0aa45 100644
--- a/backends/cadence/fused_quant/tests/BUCK
+++ b/backends/cadence/fused_quant/tests/BUCK
@@ -46,3 +46,14 @@ runtime.cxx_test(
         "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
     ],
 )
+
+runtime.cxx_test(
+    name = "test_op_bmm",  # gtest cases for fused_quant bmm_out
+    srcs = ["test_op_bmm.cpp"],
+    platforms = CXX,
+    deps = [
+        "//executorch/backends/cadence/fused_quant:op_bmm",  # kernel under test
+        "//executorch/kernels/test:gtest_utils",
+        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+    ],
+)
diff --git a/backends/cadence/fused_quant/tests/test_op_bmm.cpp b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
new file mode 100644
index 00000000000..93c511a10d5
--- /dev/null
+++ b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <executorch/backends/cadence/fused_quant/op_bmm.h>
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::testing::TensorFactory;
+
+namespace {
+
+// Empty optional<Tensor>: passed as scale/zero_point for a float operand.
+optional<Tensor> none_tensor() {
+  return {};
+}
+// Empty optional<int64_t>: passed wherever a test supplies no quant axis.
+optional<int64_t> none_axis() {
+  return {};
+}
+} // namespace
+
+class FusedQuantBmmTest : public OperatorTest {}; // supplies context_ to tests
+
+// Fully quantized with per-tensor qparams: int8 x int8 -> int8.
+TEST_F(FusedQuantBmmTest, AllQuantizedPerTensor) {
+  TensorFactory<ScalarType::Char> i8;
+  TensorFactory<ScalarType::Float> f32;
+  TensorFactory<ScalarType::Long> i64;
+
+  // inp, other and out each hold one 2x2 matrix.
+  const std::vector<int> shape{1, 2, 2};
+
+  // Every operand uses scale=0.5, zero_point=0, so a stored int8 value v
+  // stands for the real value v * 0.5.
+  const optional<Tensor> inp_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> inp_zp(i64.make({1}, {0}));
+  const optional<Tensor> other_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> other_zp(i64.make({1}, {0}));
+  const optional<Tensor> out_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> out_zp(i64.make({1}, {0}));
+
+  // Identity {{1,0},{0,1}} is stored as {2,0,0,2}.
+  Tensor inp = i8.make(shape, {2, 0, 0, 2});
+  // {{1,2},{3,4}} is stored as {2,4,6,8}.
+  Tensor other = i8.make(shape, {2, 4, 6, 8});
+  Tensor out = i8.zeros(shape);
+
+  // In real values identity * other == other = {{1,2},{3,4}}; requantizing
+  // with scale=0.5, zero_point=0 stores that as {2,4,6,8}.
+  const Tensor expected = i8.make(shape, {2, 4, 6, 8});
+
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      // inp quantization
+      inp_scale,
+      inp_zp,
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // other quantization
+      other_scale,
+      other_zp,
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // out quantization
+      out_scale,
+      out_zp,
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+// Float inputs, quantized output: float x float -> int8.
+TEST_F(FusedQuantBmmTest, FloatInputsQuantizedOutput) {
+  TensorFactory<ScalarType::Char> i8;
+  TensorFactory<ScalarType::Float> f32;
+  TensorFactory<ScalarType::Long> i64;
+
+  const std::vector<int> shape{1, 2, 2};
+
+  Tensor inp = f32.make(shape, {1.0, 0.0, 0.0, 1.0}); // identity
+  Tensor other = f32.make(shape, {1.0, 2.0, 3.0, 4.0});
+  // Only out is quantized (scale=0.5, zero_point=0).
+  const optional<Tensor> out_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> out_zp(i64.make({1}, {0}));
+  Tensor out = i8.zeros(shape);
+
+  // identity * other == other = {{1,2},{3,4}}; at scale=0.5 that is {2,4,6,8}.
+  const Tensor expected = i8.make(shape, {2, 4, 6, 8});
+
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      // inp: float, no qparams
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // other: float, no qparams
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out quantization
+      out_scale,
+      out_zp,
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+// Quantized inputs, float output: int8 x int8 -> float.
+TEST_F(FusedQuantBmmTest, QuantizedInputsFloatOutput) {
+  TensorFactory<ScalarType::Char> i8;
+  TensorFactory<ScalarType::Float> f32;
+  TensorFactory<ScalarType::Long> i64;
+
+  const std::vector<int> shape{1, 2, 2};
+  // Both inputs: scale=0.5, zero_point=0 (a stored v means v * 0.5).
+  const optional<Tensor> inp_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> inp_zp(i64.make({1}, {0}));
+  const optional<Tensor> other_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> other_zp(i64.make({1}, {0}));
+
+  // inp dequantizes to the identity, other to {{1,2},{3,4}}.
+  Tensor inp = i8.make(shape, {2, 0, 0, 2});
+  Tensor other = i8.make(shape, {2, 4, 6, 8});
+  Tensor out = f32.zeros(shape);
+  // out is float, so the product identity * other is expected unquantized.
+  const Tensor expected = f32.make(shape, {1.0, 2.0, 3.0, 4.0});
+
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      // inp quantization
+      inp_scale,
+      inp_zp,
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // other quantization
+      other_scale,
+      other_zp,
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // out: float, no qparams
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+// Mixed inputs, quantized inp and float other: int8 x float -> int8.
+TEST_F(FusedQuantBmmTest, QuantizedInpFloatOther) {
+  TensorFactory<ScalarType::Char> i8;
+  TensorFactory<ScalarType::Float> f32;
+  TensorFactory<ScalarType::Long> i64;
+
+  const std::vector<int> shape{1, 2, 2};
+  // inp and out are quantized with scale=0.5, zero_point=0; other is float.
+  const optional<Tensor> inp_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> inp_zp(i64.make({1}, {0}));
+  const optional<Tensor> out_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> out_zp(i64.make({1}, {0}));
+
+  // inp dequantizes to the identity {{1,0},{0,1}}.
+  Tensor inp = i8.make(shape, {2, 0, 0, 2});
+  Tensor other = f32.make(shape, {1.0, 2.0, 3.0, 4.0});
+  Tensor out = i8.zeros(shape);
+  // identity * other == other = {{1,2},{3,4}}; at scale=0.5 that is {2,4,6,8}.
+  const Tensor expected = i8.make(shape, {2, 4, 6, 8});
+
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      // inp quantization
+      inp_scale,
+      inp_zp,
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // other: float, no qparams
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out quantization
+      out_scale,
+      out_zp,
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+// Non-zero zero points on every operand.
+TEST_F(FusedQuantBmmTest, NonZeroZeroPoint) {
+  TensorFactory<ScalarType::Char> i8;
+  TensorFactory<ScalarType::Float> f32;
+  TensorFactory<ScalarType::Long> i64;
+
+  const std::vector<int> shape{1, 2, 2};
+
+  // Inputs: scale=0.25, zero_point=2, so a stored v means (v - 2) * 0.25.
+  const optional<Tensor> inp_scale(f32.make({1}, {0.25}));
+  const optional<Tensor> inp_zp(i64.make({1}, {2}));
+  const optional<Tensor> other_scale(f32.make({1}, {0.25}));
+  const optional<Tensor> other_zp(i64.make({1}, {2}));
+
+  // Output: scale=0.5, zero_point=1; the expected values below store a real
+  // value f as round(f / 0.5) + 1.
+  const optional<Tensor> out_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> out_zp(i64.make({1}, {1}));
+
+  // Stored {6,4,4,6}: (6-2)*0.25=1, (4-2)*0.25=0.5
+  //   -> {{1, 0.5}, {0.5, 1}}
+  Tensor inp = i8.make(shape, {6, 4, 4, 6});
+  // Stored {6,10,2,6}: (6-2)*0.25=1, (10-2)*0.25=2, (2-2)*0.25=0
+  //   -> {{1, 2}, {0, 1}}
+  Tensor other = i8.make(shape, {6, 10, 2, 6});
+  Tensor out = i8.zeros(shape);
+
+  // Real product:
+  //   {{1*1 + 0.5*0, 1*2 + 0.5*1}, {0.5*1 + 1*0, 0.5*2 + 1*1}}
+  //     = {{1, 2.5}, {0.5, 2}}
+  // Stored (divide by 0.5, add 1): 2+1=3, 5+1=6, 1+1=2, 4+1=5.
+  const Tensor expected = i8.make(shape, {3, 6, 2, 5});
+
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      // inp quantization
+      inp_scale,
+      inp_zp,
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // other quantization
+      other_scale,
+      other_zp,
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // out quantization
+      out_scale,
+      out_zp,
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+// Two batch elements: each one must be multiplied independently.
+TEST_F(FusedQuantBmmTest, LargerBatch) {
+  TensorFactory<ScalarType::Char> i8;
+  TensorFactory<ScalarType::Float> f32;
+  TensorFactory<ScalarType::Long> i64;
+
+  // batch=2, with one 2x2 matrix per batch element for every operand.
+  const std::vector<int> shape{2, 2, 2};
+
+  // scale=0.5, zero_point=0 throughout: a stored v means v * 0.5.
+  const optional<Tensor> inp_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> inp_zp(i64.make({1}, {0}));
+  const optional<Tensor> other_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> other_zp(i64.make({1}, {0}));
+  const optional<Tensor> out_scale(f32.make({1}, {0.5}));
+  const optional<Tensor> out_zp(i64.make({1}, {0}));
+
+  // inp: the identity {{1,0},{0,1}} in both batch elements.
+  Tensor inp = i8.make(shape, {2, 0, 0, 2, 2, 0, 0, 2});
+  // other: batch 0 = {{1,2},{3,4}}, batch 1 = {{5,6},{7,8}}.
+  Tensor other = i8.make(shape, {2, 4, 6, 8, 10, 12, 14, 16});
+  Tensor out = i8.zeros(shape);
+
+  // identity * other[b] == other[b] for each batch b, and out uses the same
+  // scale/zero_point as other, so out must store other's values.
+  const Tensor expected = i8.make(shape, {2, 4, 6, 8, 10, 12, 14, 16});
+
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      // inp quantization
+      inp_scale,
+      inp_zp,
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // other quantization
+      other_scale,
+      other_zp,
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // out quantization
+      out_scale,
+      out_zp,
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}