diff --git a/backends/cadence/fused_quant/op_bmm.cpp b/backends/cadence/fused_quant/op_bmm.cpp
new file mode 100644
index 00000000000..2c79bcb6a59
--- /dev/null
+++ b/backends/cadence/fused_quant/op_bmm.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cadence/fused_quant/op_bmm.h>
+#include <executorch/backends/cadence/fused_quant/quant_utils.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::KernelRuntimeContext;
+
+namespace {
+
+// Reference batched matrix multiply over contiguous row-major float buffers.
+// For every batch index b: out[b] (M x N) = inp[b] (M x K) * other[b] (K x N).
+// Each output element is accumulated in float, in increasing-k order.
+void bmm_kernel(
+    const float* inp,
+    const float* other,
+    float* out,
+    int64_t batch,
+    int64_t M,
+    int64_t K,
+    int64_t N) {
+  const int64_t lhs_stride = M * K;
+  const int64_t rhs_stride = K * N;
+  const int64_t dst_stride = M * N;
+
+  for (int64_t bi = 0; bi < batch; ++bi) {
+    const float* lhs = inp + bi * lhs_stride;
+    const float* rhs = other + bi * rhs_stride;
+    float* dst = out + bi * dst_stride;
+
+    for (int64_t row = 0; row < M; ++row) {
+      const float* lhs_row = lhs + row * K;
+      for (int64_t col = 0; col < N; ++col) {
+        float acc = 0.0f;
+        for (int64_t kk = 0; kk < K; ++kk) {
+          acc += lhs_row[kk] * rhs[kk * N + col];
+        }
+        dst[row * N + col] = acc;
+      }
+    }
+  }
+}
+
+} // namespace
+
+// Fused batched matrix multiply with optional dequantize-in / quantize-out.
+//
+// Computes out[b] = inp[b] (M x K) * other[b] (K x N) in float for each batch.
+// An operand is treated as quantized exactly when its *_scale optional holds a
+// value: quantized inputs are dequantized into temporary float buffers before
+// the matmul, and a quantized output is produced by quantizing a temporary
+// float result. Operands without a scale are read/written directly as float.
+//
+// The *_zero_point, *_quant_min, *_quant_max and *_axis arguments are forwarded
+// to extract_qparams() for the matching tensor. inp_dtype, other_dtype and
+// out_dtype are not consulted here; the storage type is taken from each
+// tensor's scalar_type().
+//
+// On malformed arguments (non-3D operands, mismatched batch or inner
+// dimensions, wrongly sized `out`, or a non-float tensor without a scale) the
+// kernel reports InvalidArgument through `ctx` and returns `out` unmodified
+// instead of indexing raw buffers out of bounds.
+//
+// Returns `out`.
+Tensor& bmm_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& inp,
+    const Tensor& other,
+    const optional<Tensor>& inp_scale,
+    const optional<Tensor>& inp_zero_point,
+    ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    optional<int64_t> inp_axis,
+    const optional<Tensor>& other_scale,
+    const optional<Tensor>& other_zero_point,
+    ScalarType other_dtype,
+    int64_t other_quant_min,
+    int64_t other_quant_max,
+    optional<int64_t> other_axis,
+    const optional<Tensor>& out_scale,
+    const optional<Tensor>& out_zero_point,
+    ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    optional<int64_t> out_axis,
+    Tensor& out) {
+  const bool inp_quantized = inp_scale.has_value();
+  const bool other_quantized = other_scale.has_value();
+  const bool out_quantized = out_scale.has_value();
+
+  // The float kernel indexes raw buffers using sizes taken from inp/other, so
+  // every shape assumption has to hold before any data pointer is touched.
+  ET_KERNEL_CHECK(
+      ctx, inp.dim() == 3 && other.dim() == 3, InvalidArgument, out);
+  ET_KERNEL_CHECK(ctx, inp.size(0) == other.size(0), InvalidArgument, out);
+  ET_KERNEL_CHECK(ctx, inp.size(2) == other.size(1), InvalidArgument, out);
+
+  const int64_t batch = inp.size(0);
+  const int64_t M = inp.size(1);
+  const int64_t K = inp.size(2);
+  const int64_t N = other.size(2);
+  const int64_t out_numel = batch * M * N;
+  ET_KERNEL_CHECK(ctx, out.numel() == out_numel, InvalidArgument, out);
+
+  // A tensor without a scale is accessed through float pointers.
+  ET_KERNEL_CHECK(
+      ctx,
+      inp_quantized || inp.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      other_quantized || other.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      out_quantized || out.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+
+  // Returns a float view of `t`: its own storage when no scale is given,
+  // otherwise `buf` filled with the dequantized values.
+  const auto as_float = [](const Tensor& t,
+                           const optional<Tensor>& scale,
+                           const optional<Tensor>& zero_point,
+                           int64_t quant_min,
+                           int64_t quant_max,
+                           optional<int64_t> axis,
+                           std::vector<float>& buf) -> const float* {
+    if (!scale.has_value()) {
+      return t.const_data_ptr<float>();
+    }
+    const int64_t count = t.numel();
+    buf.resize(count);
+    QParams qp =
+        extract_qparams(scale, zero_point, quant_min, quant_max, axis, t);
+    FUSED_QUANT_DTYPE_SWITCH(
+        t.scalar_type(),
+        scalar_t,
+        dequantize_buffer(
+            t.const_data_ptr<scalar_t>(), buf.data(), count, qp);)
+    return buf.data();
+  };
+
+  std::vector<float> inp_buf;
+  const float* const inp_float = as_float(
+      inp,
+      inp_scale,
+      inp_zero_point,
+      inp_quant_min,
+      inp_quant_max,
+      inp_axis,
+      inp_buf);
+
+  std::vector<float> other_buf;
+  const float* const other_float = as_float(
+      other,
+      other_scale,
+      other_zero_point,
+      other_quant_min,
+      other_quant_max,
+      other_axis,
+      other_buf);
+
+  // A quantized output needs a float staging buffer; a float output is written
+  // in place.
+  std::vector<float> result_float;
+  float* result = nullptr;
+  if (out_quantized) {
+    result_float.resize(out_numel);
+    result = result_float.data();
+  } else {
+    result = out.mutable_data_ptr<float>();
+  }
+
+  bmm_kernel(inp_float, other_float, result, batch, M, K, N);
+
+  if (out_quantized) {
+    QParams qp = extract_qparams(
+        out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+    FUSED_QUANT_DTYPE_SWITCH(out.scalar_type(),
+                             scalar_t,
+                             quantize_buffer(
+                                 result_float.data(),
+                                 out.mutable_data_ptr<scalar_t>(),
+                                 out_numel,
+                                 qp);)
+  }
+
+  return out;
+}
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/op_bmm.h b/backends/cadence/fused_quant/op_bmm.h
new file mode 100644
index 00000000000..f814b46b481
--- /dev/null
+++ b/backends/cadence/fused_quant/op_bmm.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+// Batched matrix multiply with optional fused dequantize/quantize.
+//
+// The implementation reads `inp` as [batch, M, K] and `other` as
+// [batch, K, N] and writes batch * M * N results to `out`. The product is
+// computed in float.
+//
+// For each of inp / other / out, the tensor is treated as quantized exactly
+// when the matching *_scale optional holds a value; in that case the
+// *_zero_point, *_quant_min, *_quant_max and *_axis arguments describe its
+// quantization. Without a scale the tensor is accessed as float.
+//
+// The *_dtype arguments are accepted for schema compatibility; the
+// implementation dispatches on each tensor's own scalar_type().
+//
+// Returns `out`.
+executorch::aten::Tensor& bmm_out(
+    executorch::runtime::KernelRuntimeContext& ctx,
+    const executorch::aten::Tensor& inp,
+    const executorch::aten::Tensor& other,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    executorch::aten::ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    executorch::aten::optional<int64_t> inp_axis,
+    const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>&
+        other_zero_point,
+    executorch::aten::ScalarType other_dtype,
+    int64_t other_quant_min,
+    int64_t other_quant_max,
+    executorch::aten::optional<int64_t> other_axis,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    executorch::aten::ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    executorch::aten::optional<int64_t> out_axis,
+    executorch::aten::Tensor& out);
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/op_convolution.cpp b/backends/cadence/fused_quant/op_convolution.cpp
new file mode 100644
index 00000000000..bb713f61093
--- /dev/null
+++ b/backends/cadence/fused_quant/op_convolution.cpp
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cadence/fused_quant/op_convolution.h>
+#include <executorch/backends/cadence/fused_quant/quant_utils.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+using executorch::aten::IntArrayRef;
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::KernelRuntimeContext;
+
+namespace {
+
+// Reference grouped 2-D convolution over contiguous float buffers.
+//
+// Layouts: inp is [N, C_in, H_in, W_in], weight is
+// [C_out, C_in / groups, kH, kW], out is [N, C_out, H_out, W_out]. `bias` may
+// be null; otherwise bias[oc] seeds the accumulator of output channel oc.
+// Kernel taps that land outside the input (i.e. in the padding) contribute
+// nothing. Accumulation is in float, ordered by input channel, then kernel
+// row, then kernel column.
+void conv2d_kernel(
+    const float* inp,
+    const float* weight,
+    const float* bias,
+    float* out,
+    int64_t N,
+    int64_t C_in,
+    int64_t H_in,
+    int64_t W_in,
+    int64_t C_out,
+    int64_t kH,
+    int64_t kW,
+    int64_t stride_h,
+    int64_t stride_w,
+    int64_t pad_h,
+    int64_t pad_w,
+    int64_t dil_h,
+    int64_t dil_w,
+    int64_t groups,
+    int64_t H_out,
+    int64_t W_out) {
+  const int64_t in_ch_per_group = C_in / groups;
+  const int64_t out_ch_per_group = C_out / groups;
+
+  for (int64_t img = 0; img < N; ++img) {
+    for (int64_t grp = 0; grp < groups; ++grp) {
+      for (int64_t oc_local = 0; oc_local < out_ch_per_group; ++oc_local) {
+        const int64_t oc_abs = grp * out_ch_per_group + oc_local;
+        for (int64_t out_y = 0; out_y < H_out; ++out_y) {
+          for (int64_t out_x = 0; out_x < W_out; ++out_x) {
+            float acc = bias ? bias[oc_abs] : 0.0f;
+            for (int64_t ic_local = 0; ic_local < in_ch_per_group;
+                 ++ic_local) {
+              const int64_t ic_abs = grp * in_ch_per_group + ic_local;
+              for (int64_t ky = 0; ky < kH; ++ky) {
+                const int64_t in_y = out_y * stride_h - pad_h + ky * dil_h;
+                if (in_y < 0 || in_y >= H_in) {
+                  continue; // entire kernel row lies in the padding
+                }
+                for (int64_t kx = 0; kx < kW; ++kx) {
+                  const int64_t in_x = out_x * stride_w - pad_w + kx * dil_w;
+                  if (in_x < 0 || in_x >= W_in) {
+                    continue; // this tap lies in the padding
+                  }
+                  const int64_t inp_idx =
+                      ((img * C_in + ic_abs) * H_in + in_y) * W_in + in_x;
+                  const int64_t w_idx =
+                      ((oc_abs * in_ch_per_group + ic_local) * kH + ky) * kW +
+                      kx;
+                  acc += inp[inp_idx] * weight[w_idx];
+                }
+              }
+            }
+            const int64_t out_idx =
+                ((img * C_out + oc_abs) * H_out + out_y) * W_out + out_x;
+            out[out_idx] = acc;
+          }
+        }
+      }
+    }
+  }
+}
+
+} // namespace
+
+// Fused 2-D convolution with optional dequantize-in / quantize-out.
+//
+// `inp` is read as [N, C_in, H_in, W_in] and `weight` as
+// [C_out, C_in / groups, kH, kW]; `bias`, when present, supplies one value per
+// output channel. The convolution itself runs in float. For each of
+// inp / weight / bias / out, the tensor is treated as quantized exactly when
+// its *_scale optional holds a value; quantized inputs are dequantized into
+// temporary float buffers and a quantized output is produced from a temporary
+// float result. The *_zero_point, *_quant_min, *_quant_max and *_axis
+// arguments are forwarded to extract_qparams() for the matching tensor. The
+// *_dtype arguments are not consulted; each tensor's scalar_type() is used.
+//
+// stride / padding / dilation are [height, width] pairs (only the first two
+// entries are read). Output extent per spatial dim is
+//   (in + 2 * pad - dil * (k - 1) - 1) / stride + 1.
+//
+// On malformed arguments (wrong rank, fewer than two stride / padding /
+// dilation entries, non-positive stride / dilation / groups, negative padding,
+// channel counts not divisible by groups, weight channels not matching
+// C_in / groups, a kernel larger than the padded input, wrongly sized bias or
+// `out`, or a non-float tensor without a scale) the kernel reports
+// InvalidArgument through `ctx` and returns `out` unmodified.
+//
+// Returns `out`.
+Tensor& convolution_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& inp,
+    const Tensor& weight,
+    const optional<Tensor>& bias,
+    // inp qparams
+    const optional<Tensor>& inp_scale,
+    const optional<Tensor>& inp_zero_point,
+    ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    optional<int64_t> inp_axis,
+    // weight qparams
+    const optional<Tensor>& weight_scale,
+    const optional<Tensor>& weight_zero_point,
+    ScalarType weight_dtype,
+    int64_t weight_quant_min,
+    int64_t weight_quant_max,
+    optional<int64_t> weight_axis,
+    // bias qparams
+    const optional<Tensor>& bias_scale,
+    const optional<Tensor>& bias_zero_point,
+    ScalarType bias_dtype,
+    int64_t bias_quant_min,
+    int64_t bias_quant_max,
+    optional<int64_t> bias_axis,
+    // out qparams
+    const optional<Tensor>& out_scale,
+    const optional<Tensor>& out_zero_point,
+    ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    optional<int64_t> out_axis,
+    // conv params
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    Tensor& out) {
+  const bool inp_quantized = inp_scale.has_value();
+  const bool weight_quantized = weight_scale.has_value();
+  const bool bias_quantized = bias_scale.has_value();
+  const bool out_quantized = out_scale.has_value();
+
+  // Rank and parameter-list lengths must be known before indexing them.
+  ET_KERNEL_CHECK(
+      ctx, inp.dim() == 4 && weight.dim() == 4, InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx,
+      stride.size() >= 2 && padding.size() >= 2 && dilation.size() >= 2,
+      InvalidArgument,
+      out);
+
+  // Input tensor is [N, C_in, H_in, W_in].
+  const int64_t N = inp.size(0);
+  const int64_t C_in = inp.size(1);
+  const int64_t H_in = inp.size(2);
+  const int64_t W_in = inp.size(3);
+
+  // Weight tensor is [C_out, C_in / groups, kH, kW].
+  const int64_t C_out = weight.size(0);
+  const int64_t kH = weight.size(2);
+  const int64_t kW = weight.size(3);
+
+  const int64_t stride_h = stride[0];
+  const int64_t stride_w = stride[1];
+  const int64_t pad_h = padding[0];
+  const int64_t pad_w = padding[1];
+  const int64_t dil_h = dilation[0];
+  const int64_t dil_w = dilation[1];
+
+  // groups and stride are used as divisors below and in the kernel.
+  ET_KERNEL_CHECK(
+      ctx,
+      groups > 0 && stride_h > 0 && stride_w > 0,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      pad_h >= 0 && pad_w >= 0 && dil_h > 0 && dil_w > 0,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      C_in % groups == 0 && C_out % groups == 0,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx, weight.size(1) == C_in / groups, InvalidArgument, out);
+
+  // A negative span means the dilated kernel does not fit in the padded
+  // input; the output extent would be non-positive.
+  const int64_t span_h = H_in + 2 * pad_h - dil_h * (kH - 1) - 1;
+  const int64_t span_w = W_in + 2 * pad_w - dil_w * (kW - 1) - 1;
+  ET_KERNEL_CHECK(ctx, span_h >= 0 && span_w >= 0, InvalidArgument, out);
+
+  const int64_t H_out = span_h / stride_h + 1;
+  const int64_t W_out = span_w / stride_w + 1;
+  const int64_t out_numel = N * C_out * H_out * W_out;
+  ET_KERNEL_CHECK(ctx, out.numel() == out_numel, InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx,
+      !bias.has_value() || bias.value().numel() == C_out,
+      InvalidArgument,
+      out);
+
+  // A tensor without a scale is accessed through float pointers.
+  ET_KERNEL_CHECK(
+      ctx,
+      inp_quantized || inp.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      weight_quantized || weight.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      !bias.has_value() || bias_quantized ||
+          bias.value().scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      out_quantized || out.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+
+  // Returns a float view of `t`: its own storage when no scale is given,
+  // otherwise `buf` filled with the dequantized values.
+  const auto as_float = [](const Tensor& t,
+                           const optional<Tensor>& scale,
+                           const optional<Tensor>& zero_point,
+                           int64_t quant_min,
+                           int64_t quant_max,
+                           optional<int64_t> axis,
+                           std::vector<float>& buf) -> const float* {
+    if (!scale.has_value()) {
+      return t.const_data_ptr<float>();
+    }
+    const int64_t count = t.numel();
+    buf.resize(count);
+    QParams qp =
+        extract_qparams(scale, zero_point, quant_min, quant_max, axis, t);
+    FUSED_QUANT_DTYPE_SWITCH(
+        t.scalar_type(),
+        scalar_t,
+        dequantize_buffer(
+            t.const_data_ptr<scalar_t>(), buf.data(), count, qp);)
+    return buf.data();
+  };
+
+  std::vector<float> inp_buf;
+  const float* const inp_float = as_float(
+      inp,
+      inp_scale,
+      inp_zero_point,
+      inp_quant_min,
+      inp_quant_max,
+      inp_axis,
+      inp_buf);
+
+  std::vector<float> weight_buf;
+  const float* const weight_float = as_float(
+      weight,
+      weight_scale,
+      weight_zero_point,
+      weight_quant_min,
+      weight_quant_max,
+      weight_axis,
+      weight_buf);
+
+  // Bias is optional; a null pointer tells the kernel to start from zero.
+  std::vector<float> bias_buf;
+  const float* bias_float = nullptr;
+  if (bias.has_value()) {
+    bias_float = as_float(
+        bias.value(),
+        bias_scale,
+        bias_zero_point,
+        bias_quant_min,
+        bias_quant_max,
+        bias_axis,
+        bias_buf);
+  }
+
+  // A quantized output needs a float staging buffer; a float output is written
+  // in place.
+  std::vector<float> result_float;
+  float* result = nullptr;
+  if (out_quantized) {
+    result_float.resize(out_numel);
+    result = result_float.data();
+  } else {
+    result = out.mutable_data_ptr<float>();
+  }
+
+  conv2d_kernel(
+      inp_float,
+      weight_float,
+      bias_float,
+      result,
+      N,
+      C_in,
+      H_in,
+      W_in,
+      C_out,
+      kH,
+      kW,
+      stride_h,
+      stride_w,
+      pad_h,
+      pad_w,
+      dil_h,
+      dil_w,
+      groups,
+      H_out,
+      W_out);
+
+  if (out_quantized) {
+    QParams qp = extract_qparams(
+        out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+    FUSED_QUANT_DTYPE_SWITCH(out.scalar_type(),
+                             scalar_t,
+                             quantize_buffer(
+                                 result_float.data(),
+                                 out.mutable_data_ptr<scalar_t>(),
+                                 out_numel,
+                                 qp);)
+  }
+
+  return out;
+}
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/op_convolution.h b/backends/cadence/fused_quant/op_convolution.h
new file mode 100644
index 00000000000..8bc9a7200b7
--- /dev/null
+++ b/backends/cadence/fused_quant/op_convolution.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+// 2-D convolution with optional fused dequantize/quantize.
+//
+// The implementation reads `inp` as [N, C_in, H_in, W_in] and `weight` as
+// [C_out, C_in / groups, kH, kW]; `bias` is optional. The convolution is
+// computed in float.
+//
+// For each of inp / weight / bias / out, the tensor is treated as quantized
+// exactly when the matching *_scale optional holds a value; in that case the
+// *_zero_point, *_quant_min, *_quant_max and *_axis arguments describe its
+// quantization. Without a scale the tensor is accessed as float.
+//
+// The *_dtype arguments are accepted for schema compatibility; the
+// implementation dispatches on each tensor's own scalar_type().
+//
+// stride / padding / dilation are indexed as [height, width]; only their
+// first two entries are read.
+//
+// Returns `out`.
+executorch::aten::Tensor& convolution_out(
+    executorch::runtime::KernelRuntimeContext& ctx,
+    const executorch::aten::Tensor& inp,
+    const executorch::aten::Tensor& weight,
+    const executorch::aten::optional<executorch::aten::Tensor>& bias,
+    // inp qparams (6)
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    executorch::aten::ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    executorch::aten::optional<int64_t> inp_axis,
+    // weight qparams (6)
+    const executorch::aten::optional<executorch::aten::Tensor>& weight_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>&
+        weight_zero_point,
+    executorch::aten::ScalarType weight_dtype,
+    int64_t weight_quant_min,
+    int64_t weight_quant_max,
+    executorch::aten::optional<int64_t> weight_axis,
+    // bias qparams (6)
+    const executorch::aten::optional<executorch::aten::Tensor>& bias_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& bias_zero_point,
+    executorch::aten::ScalarType bias_dtype,
+    int64_t bias_quant_min,
+    int64_t bias_quant_max,
+    executorch::aten::optional<int64_t> bias_axis,
+    // out qparams (6)
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    executorch::aten::ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    executorch::aten::optional<int64_t> out_axis,
+    // conv params
+    executorch::aten::IntArrayRef stride,
+    executorch::aten::IntArrayRef padding,
+    executorch::aten::IntArrayRef dilation,
+    int64_t groups,
+    executorch::aten::Tensor& out);
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/op_hardswish.cpp b/backends/cadence/fused_quant/op_hardswish.cpp
new file mode 100644
index 00000000000..0d653a1bfae
--- /dev/null
+++ b/backends/cadence/fused_quant/op_hardswish.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <algorithm>
+
+#include <executorch/backends/cadence/fused_quant/op_hardswish.h>
+#include <executorch/backends/cadence/fused_quant/quant_utils.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::KernelRuntimeContext;
+
+namespace {
+
+// Elementwise hardswish over `numel` floats:
+//   out[i] = inp[i] * clamp(inp[i] + 3, 0, 6) / 6.
+void hardswish_kernel(const float* inp, float* out, int64_t numel) {
+  int64_t idx = 0;
+  while (idx < numel) {
+    const float value = inp[idx];
+    const float gate = std::min(std::max(value + 3.0f, 0.0f), 6.0f);
+    out[idx] = value * gate / 6.0f;
+    ++idx;
+  }
+}
+
+} // namespace
+
+// Fused elementwise hardswish with optional dequantize-in / quantize-out.
+//
+// Applies x * clamp(x + 3, 0, 6) / 6 in float to every element of `inp`.
+// `inp` is treated as quantized exactly when inp_scale holds a value (it is
+// then dequantized into a temporary float buffer); `out` is treated as
+// quantized exactly when out_scale holds a value (the float result is then
+// quantized into it). The *_zero_point, *_quant_min, *_quant_max and *_axis
+// arguments are forwarded to extract_qparams() for the matching tensor.
+// inp_dtype and out_dtype are not consulted; each tensor's scalar_type() is
+// used.
+//
+// If `out` does not hold exactly inp.numel() elements, or a tensor without a
+// scale is not float, the kernel reports InvalidArgument through `ctx` and
+// returns `out` unmodified.
+//
+// Returns `out`.
+Tensor& hardswish_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& inp,
+    const optional<Tensor>& inp_scale,
+    const optional<Tensor>& inp_zero_point,
+    ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    optional<int64_t> inp_axis,
+    const optional<Tensor>& out_scale,
+    const optional<Tensor>& out_zero_point,
+    ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    optional<int64_t> out_axis,
+    Tensor& out) {
+  const int64_t numel = inp.numel();
+  const bool inp_quantized = inp_scale.has_value();
+  const bool out_quantized = out_scale.has_value();
+
+  // The kernel writes `numel` elements through a raw pointer into `out`.
+  ET_KERNEL_CHECK(ctx, out.numel() == numel, InvalidArgument, out);
+  // A tensor without a scale is accessed through float pointers.
+  ET_KERNEL_CHECK(
+      ctx,
+      inp_quantized || inp.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      out_quantized || out.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+
+  // Float view of the input: its own storage, or a dequantized copy.
+  std::vector<float> inp_buf;
+  const float* const inp_float = [&]() -> const float* {
+    if (!inp_quantized) {
+      return inp.const_data_ptr<float>();
+    }
+    inp_buf.resize(numel);
+    QParams qp = extract_qparams(
+        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp_axis, inp);
+    FUSED_QUANT_DTYPE_SWITCH(
+        inp.scalar_type(),
+        scalar_t,
+        dequantize_buffer(
+            inp.const_data_ptr<scalar_t>(), inp_buf.data(), numel, qp);)
+    return inp_buf.data();
+  }();
+
+  // A quantized output needs a float staging buffer; a float output is written
+  // in place.
+  std::vector<float> result_float;
+  float* result = nullptr;
+  if (out_quantized) {
+    result_float.resize(numel);
+    result = result_float.data();
+  } else {
+    result = out.mutable_data_ptr<float>();
+  }
+
+  hardswish_kernel(inp_float, result, numel);
+
+  if (out_quantized) {
+    QParams qp = extract_qparams(
+        out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+    FUSED_QUANT_DTYPE_SWITCH(
+        out.scalar_type(),
+        scalar_t,
+        quantize_buffer(
+            result_float.data(), out.mutable_data_ptr<scalar_t>(), numel, qp);)
+  }
+
+  return out;
+}
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/op_hardswish.h b/backends/cadence/fused_quant/op_hardswish.h
new file mode 100644
index 00000000000..7cba5b07788
--- /dev/null
+++ b/backends/cadence/fused_quant/op_hardswish.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+// Elementwise hardswish with optional fused dequantize/quantize.
+//
+// The implementation evaluates x * clamp(x + 3, 0, 6) / 6 in float for each
+// of the inp.numel() elements and writes the same number of elements to
+// `out`.
+//
+// `inp` / `out` are treated as quantized exactly when inp_scale / out_scale
+// holds a value; in that case the matching *_zero_point, *_quant_min,
+// *_quant_max and *_axis arguments describe the quantization. Without a scale
+// the tensor is accessed as float.
+//
+// The *_dtype arguments are accepted for schema compatibility; the
+// implementation dispatches on each tensor's own scalar_type().
+//
+// Returns `out`.
+executorch::aten::Tensor& hardswish_out(
+    executorch::runtime::KernelRuntimeContext& ctx,
+    const executorch::aten::Tensor& inp,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    executorch::aten::ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    executorch::aten::optional<int64_t> inp_axis,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    executorch::aten::ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    executorch::aten::optional<int64_t> out_axis,
+    executorch::aten::Tensor& out);
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/op_linear.cpp b/backends/cadence/fused_quant/op_linear.cpp
new file mode 100644
index 00000000000..be846fd5ede
--- /dev/null
+++ b/backends/cadence/fused_quant/op_linear.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cadence/fused_quant/op_linear.h>
+#include <executorch/backends/cadence/fused_quant/quant_utils.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::KernelRuntimeContext;
+
+namespace {
+
+// Reference fully-connected layer over contiguous row-major float buffers:
+//   out[r, o] = (bias ? bias[o] : 0) + sum_i inp[r, i] * weight[o, i]
+// with inp [num_rows, in_features], weight [out_features, in_features] and
+// out [num_rows, out_features]. Accumulation is in float, in increasing-i
+// order.
+void linear_kernel(
+    const float* inp,
+    const float* weight,
+    const float* bias,
+    float* out,
+    int64_t num_rows,
+    int64_t in_features,
+    int64_t out_features) {
+  for (int64_t row = 0; row < num_rows; ++row) {
+    const float* x = inp + row * in_features;
+    float* y = out + row * out_features;
+    for (int64_t col = 0; col < out_features; ++col) {
+      const float* w = weight + col * in_features;
+      float acc = bias ? bias[col] : 0.0f;
+      for (int64_t j = 0; j < in_features; ++j) {
+        acc += x[j] * w[j];
+      }
+      y[col] = acc;
+    }
+  }
+}
+
+} // namespace
+
+// Fused linear layer with optional dequantize-in / quantize-out.
+//
+// Treats `inp` as [..., in_features] flattened to rows, `weight` as
+// [out_features, in_features] and the optional `bias` as [out_features], and
+// computes out[r, o] = bias[o] + sum_i inp[r, i] * weight[o, i] in float.
+// For each of inp / weight / bias / out, the tensor is treated as quantized
+// exactly when its *_scale optional holds a value; quantized inputs are
+// dequantized into temporary float buffers and a quantized output is produced
+// from a temporary float result. The *_zero_point, *_quant_min, *_quant_max
+// and *_axis arguments are forwarded to extract_qparams() for the matching
+// tensor. The *_dtype arguments are not consulted; each tensor's
+// scalar_type() is used.
+//
+// On malformed arguments (0-d `inp`, non-2D `weight`, weight columns not
+// matching in_features, wrongly sized bias or `out`, or a non-float tensor
+// without a scale) the kernel reports InvalidArgument through `ctx` and
+// returns `out` unmodified.
+//
+// Returns `out`.
+Tensor& linear_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& inp,
+    const Tensor& weight,
+    const optional<Tensor>& bias,
+    // inp qparams
+    const optional<Tensor>& inp_scale,
+    const optional<Tensor>& inp_zero_point,
+    ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    optional<int64_t> inp_axis,
+    // weight qparams
+    const optional<Tensor>& weight_scale,
+    const optional<Tensor>& weight_zero_point,
+    ScalarType weight_dtype,
+    int64_t weight_quant_min,
+    int64_t weight_quant_max,
+    optional<int64_t> weight_axis,
+    // bias qparams
+    const optional<Tensor>& bias_scale,
+    const optional<Tensor>& bias_zero_point,
+    ScalarType bias_dtype,
+    int64_t bias_quant_min,
+    int64_t bias_quant_max,
+    optional<int64_t> bias_axis,
+    // out qparams
+    const optional<Tensor>& out_scale,
+    const optional<Tensor>& out_zero_point,
+    ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    optional<int64_t> out_axis,
+    Tensor& out) {
+  const bool inp_quantized = inp_scale.has_value();
+  const bool weight_quantized = weight_scale.has_value();
+  const bool bias_quantized = bias_scale.has_value();
+  const bool out_quantized = out_scale.has_value();
+
+  // Ranks must be known before the last / first dimensions are read.
+  ET_KERNEL_CHECK(
+      ctx, inp.dim() >= 1 && weight.dim() == 2, InvalidArgument, out);
+
+  const int64_t in_features = inp.size(inp.dim() - 1);
+  const int64_t out_features = weight.size(0);
+  ET_KERNEL_CHECK(ctx, weight.size(1) == in_features, InvalidArgument, out);
+
+  // Row count is the product of the leading dimensions. Computing it this way
+  // (rather than numel / in_features) stays well-defined when in_features is 0.
+  int64_t num_rows = 1;
+  for (int64_t d = 0; d + 1 < inp.dim(); ++d) {
+    num_rows *= inp.size(d);
+  }
+  const int64_t out_numel = num_rows * out_features;
+
+  ET_KERNEL_CHECK(ctx, out.numel() == out_numel, InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx,
+      !bias.has_value() || bias.value().numel() == out_features,
+      InvalidArgument,
+      out);
+
+  // A tensor without a scale is accessed through float pointers.
+  ET_KERNEL_CHECK(
+      ctx,
+      inp_quantized || inp.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      weight_quantized || weight.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      !bias.has_value() || bias_quantized ||
+          bias.value().scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      out_quantized || out.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+
+  // Returns a float view of `t`: its own storage when no scale is given,
+  // otherwise `buf` filled with the dequantized values.
+  const auto as_float = [](const Tensor& t,
+                           const optional<Tensor>& scale,
+                           const optional<Tensor>& zero_point,
+                           int64_t quant_min,
+                           int64_t quant_max,
+                           optional<int64_t> axis,
+                           std::vector<float>& buf) -> const float* {
+    if (!scale.has_value()) {
+      return t.const_data_ptr<float>();
+    }
+    const int64_t count = t.numel();
+    buf.resize(count);
+    QParams qp =
+        extract_qparams(scale, zero_point, quant_min, quant_max, axis, t);
+    FUSED_QUANT_DTYPE_SWITCH(
+        t.scalar_type(),
+        scalar_t,
+        dequantize_buffer(
+            t.const_data_ptr<scalar_t>(), buf.data(), count, qp);)
+    return buf.data();
+  };
+
+  std::vector<float> inp_buf;
+  const float* const inp_float = as_float(
+      inp,
+      inp_scale,
+      inp_zero_point,
+      inp_quant_min,
+      inp_quant_max,
+      inp_axis,
+      inp_buf);
+
+  std::vector<float> weight_buf;
+  const float* const weight_float = as_float(
+      weight,
+      weight_scale,
+      weight_zero_point,
+      weight_quant_min,
+      weight_quant_max,
+      weight_axis,
+      weight_buf);
+
+  // Bias is optional; a null pointer tells the kernel to start from zero.
+  std::vector<float> bias_buf;
+  const float* bias_float = nullptr;
+  if (bias.has_value()) {
+    bias_float = as_float(
+        bias.value(),
+        bias_scale,
+        bias_zero_point,
+        bias_quant_min,
+        bias_quant_max,
+        bias_axis,
+        bias_buf);
+  }
+
+  // A quantized output needs a float staging buffer; a float output is written
+  // in place.
+  std::vector<float> result_float;
+  float* result = nullptr;
+  if (out_quantized) {
+    result_float.resize(out_numel);
+    result = result_float.data();
+  } else {
+    result = out.mutable_data_ptr<float>();
+  }
+
+  linear_kernel(
+      inp_float,
+      weight_float,
+      bias_float,
+      result,
+      num_rows,
+      in_features,
+      out_features);
+
+  if (out_quantized) {
+    QParams qp = extract_qparams(
+        out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+    FUSED_QUANT_DTYPE_SWITCH(out.scalar_type(),
+                             scalar_t,
+                             quantize_buffer(
+                                 result_float.data(),
+                                 out.mutable_data_ptr<scalar_t>(),
+                                 out_numel,
+                                 qp);)
+  }
+
+  return out;
+}
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/op_linear.h b/backends/cadence/fused_quant/op_linear.h
new file mode 100644
index 00000000000..99d20ba5bbc
--- /dev/null
+++ b/backends/cadence/fused_quant/op_linear.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+// Linear (fully-connected) layer with optional fused dequantize/quantize.
+//
+// The implementation takes in_features from the last dimension of `inp`,
+// out_features from weight.size(0), indexes `weight` as
+// [out_features, in_features], and adds bias[o] to output feature o when
+// `bias` is present. The product is computed in float.
+//
+// For each of inp / weight / bias / out, the tensor is treated as quantized
+// exactly when the matching *_scale optional holds a value; in that case the
+// *_zero_point, *_quant_min, *_quant_max and *_axis arguments describe its
+// quantization. Without a scale the tensor is accessed as float.
+//
+// The *_dtype arguments are accepted for schema compatibility; the
+// implementation dispatches on each tensor's own scalar_type().
+//
+// Returns `out`.
+executorch::aten::Tensor& linear_out(
+    executorch::runtime::KernelRuntimeContext& ctx,
+    const executorch::aten::Tensor& inp,
+    const executorch::aten::Tensor& weight,
+    const executorch::aten::optional<executorch::aten::Tensor>& bias,
+    // inp qparams
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    executorch::aten::ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    executorch::aten::optional<int64_t> inp_axis,
+    // weight qparams
+    const executorch::aten::optional<executorch::aten::Tensor>& weight_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>&
+        weight_zero_point,
+    executorch::aten::ScalarType weight_dtype,
+    int64_t weight_quant_min,
+    int64_t weight_quant_max,
+    executorch::aten::optional<int64_t> weight_axis,
+    // bias qparams
+    const executorch::aten::optional<executorch::aten::Tensor>& bias_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& bias_zero_point,
+    executorch::aten::ScalarType bias_dtype,
+    int64_t bias_quant_min,
+    int64_t bias_quant_max,
+    executorch::aten::optional<int64_t> bias_axis,
+    // out qparams
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    executorch::aten::ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    executorch::aten::optional<int64_t> out_axis,
+    executorch::aten::Tensor& out);
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/targets.bzl b/backends/cadence/fused_quant/targets.bzl
index 0995f73e9e8..3e8d53a94c0 100644
--- a/backends/cadence/fused_quant/targets.bzl
+++ b/backends/cadence/fused_quant/targets.bzl
@@ -46,3 +46,51 @@ def define_common_targets():
         ],
         visibility = ["PUBLIC"],
     )
+
+    # Fused-quant hardswish operator: builds op_hardswish.cpp and exports
+    # op_hardswish.h. Depends on the local :quant_utils target and the
+    # ExecuTorch kernel_includes target.
+    runtime.cxx_library(
+        name = "op_hardswish",
+        srcs = ["op_hardswish.cpp"],
+        exported_headers = ["op_hardswish.h"],
+        platforms = CXX,
+        deps = [
+            ":quant_utils",
+            "//executorch/runtime/kernel:kernel_includes",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
+    # Fused-quant batched matmul operator: builds op_bmm.cpp and exports
+    # op_bmm.h. Depends on the local :quant_utils target and the ExecuTorch
+    # kernel_includes target.
+    runtime.cxx_library(
+        name = "op_bmm",
+        srcs = ["op_bmm.cpp"],
+        exported_headers = ["op_bmm.h"],
+        platforms = CXX,
+        deps = [
+            ":quant_utils",
+            "//executorch/runtime/kernel:kernel_includes",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
+    # Fused-quant linear operator: builds op_linear.cpp and exports
+    # op_linear.h. Depends on the local :quant_utils target and the ExecuTorch
+    # kernel_includes target.
+    runtime.cxx_library(
+        name = "op_linear",
+        srcs = ["op_linear.cpp"],
+        exported_headers = ["op_linear.h"],
+        platforms = CXX,
+        deps = [
+            ":quant_utils",
+            "//executorch/runtime/kernel:kernel_includes",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
+    # Fused-quant convolution operator: builds op_convolution.cpp and exports
+    # op_convolution.h. Depends on the local :quant_utils target and the
+    # ExecuTorch kernel_includes target.
+    runtime.cxx_library(
+        name = "op_convolution",
+        srcs = ["op_convolution.cpp"],
+        exported_headers = ["op_convolution.h"],
+        platforms = CXX,
+        deps = [
+            ":quant_utils",
+            "//executorch/runtime/kernel:kernel_includes",
+        ],
+        visibility = ["PUBLIC"],
+    )
diff --git a/backends/cadence/fused_quant/tests/BUCK b/backends/cadence/fused_quant/tests/BUCK
index f20c4472c57..162c662082c 100644
--- a/backends/cadence/fused_quant/tests/BUCK
+++ b/backends/cadence/fused_quant/tests/BUCK
@@ -35,3 +35,47 @@ runtime.cxx_test(
         "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
     ],
 )
+
+runtime.cxx_test(  # Test target linked against the op_hardswish library.
+    name = "test_op_hardswish",
+    srcs = ["test_op_hardswish.cpp"],
+    platforms = CXX,
+    deps = [
+        "//executorch/backends/cadence/fused_quant:op_hardswish",
+        "//executorch/kernels/test:gtest_utils",
+        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+    ],
+)
+
+runtime.cxx_test(  # Test target linked against the op_bmm library.
+    name = "test_op_bmm",
+    srcs = ["test_op_bmm.cpp"],
+    platforms = CXX,
+    deps = [
+        "//executorch/backends/cadence/fused_quant:op_bmm",
+        "//executorch/kernels/test:gtest_utils",
+        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+    ],
+)
+
+runtime.cxx_test(  # Test target linked against the op_linear library.
+    name = "test_op_linear",
+    srcs = ["test_op_linear.cpp"],
+    platforms = CXX,
+    deps = [
+        "//executorch/backends/cadence/fused_quant:op_linear",
+        "//executorch/kernels/test:gtest_utils",
+        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+    ],
+)
+
+runtime.cxx_test(  # Test target linked against the op_convolution library.
+    name = "test_op_convolution",
+    srcs = ["test_op_convolution.cpp"],
+    platforms = CXX,
+    deps = [
+        "//executorch/backends/cadence/fused_quant:op_convolution",
+        "//executorch/kernels/test:gtest_utils",
+        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+    ],
+)
diff --git a/backends/cadence/fused_quant/tests/test_op_bmm.cpp b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
new file mode 100644
index 00000000000..93c511a10d5
--- /dev/null
+++ b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <executorch/backends/cadence/fused_quant/op_bmm.h>
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::testing::TensorFactory;
+
+namespace {
+
+optional<Tensor> none_tensor() { // Empty optional: no tensor supplied.
+  return optional<Tensor>();
+}
+
+optional<int64_t> none_axis() { // Empty optional: no axis supplied.
+  return optional<int64_t>();
+}
+
+} // namespace
+
+class FusedQuantBmmTest : public OperatorTest {}; // Fixture for bmm_out tests.
+
+// All three tensors quantized (int8 × int8 → int8) with per-tensor qparams.
+TEST_F(FusedQuantBmmTest, AllQuantizedPerTensor) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2,2]: identity matrix {{1,0},{0,1}} quantized as int8
+  // other [1,2,2]: {{1,2},{3,4}} quantized as int8
+  const std::vector<int> inp_sizes{1, 2, 2};
+  const std::vector<int> other_sizes{1, 2, 2};
+  const std::vector<int> out_sizes{1, 2, 2};
+
+  // scale=0.5, zp=0: int8 value v maps to v * 0.5
+  // identity: {1,0,0,1} -> int8 {2,0,0,2}
+  Tensor inp = tf_int8.make(inp_sizes, {2, 0, 0, 2});
+  // {{1,2},{3,4}} -> int8 {2,4,6,8}
+  Tensor other = tf_int8.make(other_sizes, {2, 4, 6, 8});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor other_scale = tf_float.make({1}, {0.5});
+  Tensor other_zp = tf_long.make({1}, {0});
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros(out_sizes);
+
+  // dequant inp: {{1,0},{0,1}}
+  // dequant other: {{1,2},{3,4}}
+  // bmm: I * {{1,2},{3,4}} = {{1,2},{3,4}}
+  // requant (scale=0.5, zp=0): {2, 4, 6, 8}
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float, // inp_dtype (Float although inp is int8)
+      -128, // inp_quant_min
+      127, // inp_quant_max
+      none_axis(), // inp_axis
+      optional<Tensor>(other_scale),
+      optional<Tensor>(other_zp),
+      ScalarType::Float, // other_dtype (Float although other is int8)
+      -128, // other_quant_min
+      127, // other_quant_max
+      none_axis(), // other_axis
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char, // out_dtype
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(out_sizes, {2, 4, 6, 8}));
+}
+
+// Float inp × float other → int8 out (only the output has qparams).
+TEST_F(FusedQuantBmmTest, FloatInputsQuantizedOutput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  const std::vector<int> inp_sizes{1, 2, 2};
+  const std::vector<int> other_sizes{1, 2, 2};
+  const std::vector<int> out_sizes{1, 2, 2};
+
+  // inp is the 2x2 identity matrix, stored directly as float
+  Tensor inp = tf_float.make(inp_sizes, {1.0, 0.0, 0.0, 1.0});
+  Tensor other = tf_float.make(other_sizes, {1.0, 2.0, 3.0, 4.0});
+
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros(out_sizes);
+
+  // bmm: I * {{1,2},{3,4}} = {{1,2},{3,4}}
+  // requant (scale=0.5, zp=0): {2, 4, 6, 8}
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      none_tensor(), // inp_scale
+      none_tensor(), // inp_zero_point
+      ScalarType::Float, // inp_dtype
+      0, // inp_quant_min
+      0, // inp_quant_max
+      none_axis(), // inp_axis
+      none_tensor(), // other_scale
+      none_tensor(), // other_zero_point
+      ScalarType::Float, // other_dtype
+      0, // other_quant_min
+      0, // other_quant_max
+      none_axis(), // other_axis
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char, // out_dtype
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(out_sizes, {2, 4, 6, 8}));
+}
+
+// int8 inp × int8 other → float out (no output qparams).
+TEST_F(FusedQuantBmmTest, QuantizedInputsFloatOutput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  const std::vector<int> inp_sizes{1, 2, 2};
+  const std::vector<int> other_sizes{1, 2, 2};
+  const std::vector<int> out_sizes{1, 2, 2};
+
+  Tensor inp = tf_int8.make(inp_sizes, {2, 0, 0, 2});
+  Tensor other = tf_int8.make(other_sizes, {2, 4, 6, 8});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor other_scale = tf_float.make({1}, {0.5});
+  Tensor other_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_float.zeros(out_sizes);
+
+  // dequant inp (scale=0.5, zp=0): {{1,0},{0,1}}
+  // dequant other (scale=0.5, zp=0): {{1,2},{3,4}}
+  // bmm: {{1,2},{3,4}}, written to out as float
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float, // inp_dtype
+      -128, // inp_quant_min
+      127, // inp_quant_max
+      none_axis(), // inp_axis
+      optional<Tensor>(other_scale),
+      optional<Tensor>(other_zp),
+      ScalarType::Float, // other_dtype
+      -128, // other_quant_min
+      127, // other_quant_max
+      none_axis(), // other_axis
+      none_tensor(), // out_scale
+      none_tensor(), // out_zero_point
+      ScalarType::Float, // out_dtype
+      0,
+      0,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_float.make(out_sizes, {1.0, 2.0, 3.0, 4.0}));
+}
+
+// Mixed inputs: int8 inp × float other → int8 out.
+TEST_F(FusedQuantBmmTest, QuantizedInpFloatOther) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  const std::vector<int> inp_sizes{1, 2, 2};
+  const std::vector<int> other_sizes{1, 2, 2};
+  const std::vector<int> out_sizes{1, 2, 2};
+
+  Tensor inp = tf_int8.make(inp_sizes, {2, 0, 0, 2});
+  Tensor other = tf_float.make(other_sizes, {1.0, 2.0, 3.0, 4.0});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros(out_sizes);
+
+  // dequant inp (scale=0.5, zp=0): {{1,0},{0,1}}; other is already float
+  // bmm: I * {{1,2},{3,4}} = {{1,2},{3,4}}
+  // requant (scale=0.5, zp=0): {2, 4, 6, 8}
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float, // inp_dtype
+      -128, // inp_quant_min
+      127, // inp_quant_max
+      none_axis(), // inp_axis
+      none_tensor(), // other_scale
+      none_tensor(), // other_zero_point
+      ScalarType::Float, // other_dtype
+      0, // other_quant_min
+      0, // other_quant_max
+      none_axis(), // other_axis
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char, // out_dtype
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(out_sizes, {2, 4, 6, 8}));
+}
+
+// Non-zero zero_point on inp and other (zp=2) and on out (zp=1).
+TEST_F(FusedQuantBmmTest, NonZeroZeroPoint) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  const std::vector<int> inp_sizes{1, 2, 2};
+  const std::vector<int> other_sizes{1, 2, 2};
+  const std::vector<int> out_sizes{1, 2, 2};
+
+  // scale=0.25, zp=2: int8 value v maps to (v - 2) * 0.25
+  // inp: {{1,0.5},{0.5,1}} -> int8: (1/0.25)+2=6, (0.5/0.25)+2=4, 4, 6
+  Tensor inp = tf_int8.make(inp_sizes, {6, 4, 4, 6});
+  // other: {{1,2},{0,1}} -> int8: (1/0.25)+2=6, (2/0.25)+2=10, (0/0.25)+2=2,
+  // (1/0.25)+2=6
+  Tensor other = tf_int8.make(other_sizes, {6, 10, 2, 6});
+
+  Tensor inp_scale = tf_float.make({1}, {0.25});
+  Tensor inp_zp = tf_long.make({1}, {2});
+  Tensor other_scale = tf_float.make({1}, {0.25});
+  Tensor other_zp = tf_long.make({1}, {2});
+  // out: scale=0.5, zp=1 -> float f maps to round(f / 0.5) + 1
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {1});
+
+  Tensor out = tf_int8.zeros(out_sizes);
+
+  // dequant inp: (6-2)*0.25=1, (4-2)*0.25=0.5, (4-2)*0.25=0.5, (6-2)*0.25=1
+  //   -> {{1, 0.5}, {0.5, 1}}
+  // dequant other: (6-2)*0.25=1, (10-2)*0.25=2, (2-2)*0.25=0, (6-2)*0.25=1
+  //   -> {{1, 2}, {0, 1}}
+  // bmm: {{1*1+0.5*0, 1*2+0.5*1}, {0.5*1+1*0, 0.5*2+1*1}}
+  //    = {{1, 2.5}, {0.5, 2}}
+  // requant (scale=0.5, zp=1):
+  //   round(1/0.5)+1=3, round(2.5/0.5)+1=6,
+  //   round(0.5/0.5)+1=2, round(2/0.5)+1=5
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float, // inp_dtype
+      -128, // inp_quant_min
+      127, // inp_quant_max
+      none_axis(), // inp_axis
+      optional<Tensor>(other_scale),
+      optional<Tensor>(other_zp),
+      ScalarType::Float, // other_dtype
+      -128, // other_quant_min
+      127, // other_quant_max
+      none_axis(), // other_axis
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char, // out_dtype
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(out_sizes, {3, 6, 2, 5}));
+}
+
+// batch=2: the expected output checks both batch elements separately.
+TEST_F(FusedQuantBmmTest, LargerBatch) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [2,2,2]: two identity matrices
+  // other [2,2,2]: batch 0 = {{1,2},{3,4}}, batch 1 = {{5,6},{7,8}}
+  const std::vector<int> inp_sizes{2, 2, 2};
+  const std::vector<int> other_sizes{2, 2, 2};
+  const std::vector<int> out_sizes{2, 2, 2};
+
+  // scale=0.5, zp=0: two identity matrices as int8
+  Tensor inp = tf_int8.make(inp_sizes, {2, 0, 0, 2, 2, 0, 0, 2});
+  Tensor other = tf_int8.make(other_sizes, {2, 4, 6, 8, 10, 12, 14, 16});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor other_scale = tf_float.make({1}, {0.5});
+  Tensor other_zp = tf_long.make({1}, {0});
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros(out_sizes);
+
+  // dequant inp: two identity matrices {{1,0},{0,1}}, {{1,0},{0,1}}
+  // dequant other: {{1,2},{3,4}}, {{5,6},{7,8}}
+  // bmm batch 0: I * {{1,2},{3,4}} = {{1,2},{3,4}}
+  // bmm batch 1: I * {{5,6},{7,8}} = {{5,6},{7,8}}
+  // requant (scale=0.5, zp=0): {2,4,6,8, 10,12,14,16}
+  cadence::fused_quant::native::bmm_out(
+      context_,
+      inp,
+      other,
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float, // inp_dtype
+      -128, // inp_quant_min
+      127, // inp_quant_max
+      none_axis(), // inp_axis
+      optional<Tensor>(other_scale),
+      optional<Tensor>(other_zp),
+      ScalarType::Float, // other_dtype
+      -128, // other_quant_min
+      127, // other_quant_max
+      none_axis(), // other_axis
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char, // out_dtype
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(out_sizes, {2, 4, 6, 8, 10, 12, 14, 16}));
+}
diff --git a/backends/cadence/fused_quant/tests/test_op_convolution.cpp b/backends/cadence/fused_quant/tests/test_op_convolution.cpp
new file mode 100644
index 00000000000..769fbbffbcb
--- /dev/null
+++ b/backends/cadence/fused_quant/tests/test_op_convolution.cpp
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <executorch/backends/cadence/fused_quant/op_convolution.h>
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+
+using executorch::aten::IntArrayRef;
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::testing::TensorFactory;
+
+namespace {
+
+optional<Tensor> none_tensor() { // Empty optional: no tensor supplied.
+  return optional<Tensor>();
+}
+
+optional<int64_t> none_axis() { // Empty optional: no axis supplied.
+  return optional<int64_t>();
+}
+
+} // namespace
+
+class FusedQuantConvolutionTest : public OperatorTest {}; // Test fixture.
+
+// 1x1 convolution with int8 input, weight and output; the bias is omitted.
+TEST_F(FusedQuantConvolutionTest, Conv1x1AllQuantized) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1, 1, 2, 2] int8
+  Tensor inp = tf_int8.make({1, 1, 2, 2}, {2, 4, 6, 8});
+  // weight [1, 1, 1, 1] int8
+  Tensor weight = tf_int8.make({1, 1, 1, 1}, {2});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor weight_scale = tf_float.make({1}, {0.5});
+  Tensor weight_zp = tf_long.make({1}, {0});
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  // out [1, 1, 2, 2]
+  Tensor out = tf_int8.zeros({1, 1, 2, 2});
+
+  int64_t stride_arr[] = {1, 1};
+  int64_t padding_arr[] = {0, 0};
+  int64_t dilation_arr[] = {1, 1};
+  IntArrayRef stride(stride_arr, 2);
+  IntArrayRef padding(padding_arr, 2);
+  IntArrayRef dilation(dilation_arr, 2);
+
+  // dequant inp: {1, 2, 3, 4}
+  // dequant weight: {1}
+  // conv (1x1): {1*1, 2*1, 3*1, 4*1} = {1, 2, 3, 4}
+  // requant (scale=0.5, zp=0): {round(1/0.5), ...} = {2, 4, 6, 8}
+  cadence::fused_quant::native::convolution_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // bias (absent)
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // bias qparams (none)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      // conv params
+      stride,
+      padding,
+      dilation,
+      /*groups=*/1,
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 1, 2, 2}, {2, 4, 6, 8}));
+}
+
+// 3x3 convolution, padding=1, all quantized; center-tap kernel copies input.
+TEST_F(FusedQuantConvolutionTest, Conv3x3WithPadding) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1, 1, 3, 3] int8
+  // After dequant (scale=1, zp=0): same values as int8
+  Tensor inp = tf_int8.make({1, 1, 3, 3}, {0, 0, 0, 0, 1, 0, 0, 0, 0});
+  // weight [1, 1, 3, 3] int8: identity-like (center=1)
+  Tensor weight = tf_int8.make({1, 1, 3, 3}, {0, 0, 0, 0, 1, 0, 0, 0, 0});
+
+  Tensor inp_scale = tf_float.make({1}, {1.0});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor weight_scale = tf_float.make({1}, {1.0});
+  Tensor weight_zp = tf_long.make({1}, {0});
+  Tensor out_scale = tf_float.make({1}, {1.0});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  // out [1, 1, 3, 3] (padding=1 preserves spatial dims)
+  Tensor out = tf_int8.zeros({1, 1, 3, 3});
+
+  int64_t stride_arr[] = {1, 1};
+  int64_t padding_arr[] = {1, 1};
+  int64_t dilation_arr[] = {1, 1};
+  IntArrayRef stride(stride_arr, 2);
+  IntArrayRef padding(padding_arr, 2);
+  IntArrayRef dilation(dilation_arr, 2);
+
+  // dequant inp: {0,0,0, 0,1,0, 0,0,0}
+  // dequant weight: {0,0,0, 0,1,0, 0,0,0} (identity kernel)
+  // conv with padding=1: output equals input (identity convolution)
+  // requant (scale=1, zp=0): same values
+  cadence::fused_quant::native::convolution_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // bias (absent)
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // bias qparams (none)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      // conv params
+      stride,
+      padding,
+      dilation,
+      /*groups=*/1,
+      out);
+
+  EXPECT_TENSOR_EQ(
+      out, tf_int8.make({1, 1, 3, 3}, {0, 0, 0, 0, 1, 0, 0, 0, 0}));
+}
+
+// Float input and weight (no input qparams); only the output is quantized.
+TEST_F(FusedQuantConvolutionTest, FloatInputsQuantizedOutput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1, 1, 2, 2] float
+  Tensor inp = tf_float.make({1, 1, 2, 2}, {1.0, 2.0, 3.0, 4.0});
+  // weight [1, 1, 1, 1] float
+  Tensor weight = tf_float.make({1, 1, 1, 1}, {2.0});
+
+  Tensor out_scale = tf_float.make({1}, {1.0});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 1, 2, 2});
+
+  int64_t stride_arr[] = {1, 1};
+  int64_t padding_arr[] = {0, 0};
+  int64_t dilation_arr[] = {1, 1};
+  IntArrayRef stride(stride_arr, 2);
+  IntArrayRef padding(padding_arr, 2);
+  IntArrayRef dilation(dilation_arr, 2);
+
+  // conv (1x1, w=2.0): {2.0, 4.0, 6.0, 8.0}
+  // requant (scale=1.0, zp=0): {2, 4, 6, 8}
+  cadence::fused_quant::native::convolution_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // bias (absent)
+      // inp qparams (none, float)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // weight qparams (none, float)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // bias qparams (none)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      // conv params
+      stride,
+      padding,
+      dilation,
+      /*groups=*/1,
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 1, 2, 2}, {2, 4, 6, 8}));
+}
+
+// int8 input and weight with qparams; float output with no output qparams.
+TEST_F(FusedQuantConvolutionTest, QuantizedInputsFloatOutput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1, 1, 2, 2] int8
+  Tensor inp = tf_int8.make({1, 1, 2, 2}, {2, 4, 6, 8});
+  // weight [1, 1, 1, 1] int8
+  Tensor weight = tf_int8.make({1, 1, 1, 1}, {4});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor weight_scale = tf_float.make({1}, {0.5});
+  Tensor weight_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_float.zeros({1, 1, 2, 2});
+
+  int64_t stride_arr[] = {1, 1};
+  int64_t padding_arr[] = {0, 0};
+  int64_t dilation_arr[] = {1, 1};
+  IntArrayRef stride(stride_arr, 2);
+  IntArrayRef padding(padding_arr, 2);
+  IntArrayRef dilation(dilation_arr, 2);
+
+  // dequant inp: {1, 2, 3, 4}
+  // dequant weight: {2}
+  // conv (1x1): {2, 4, 6, 8}
+  // output is float, no requant
+  cadence::fused_quant::native::convolution_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // bias (absent)
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // bias qparams (none)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams (none, float)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // conv params
+      stride,
+      padding,
+      dilation,
+      /*groups=*/1,
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_float.make({1, 1, 2, 2}, {2.0, 4.0, 6.0, 8.0}));
+}
+
+// int8 input and weight plus an unquantized float bias; float output.
+TEST_F(FusedQuantConvolutionTest, WithBias) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1, 1, 2, 2] int8
+  Tensor inp = tf_int8.make({1, 1, 2, 2}, {2, 4, 6, 8});
+  // weight [1, 1, 1, 1] int8
+  Tensor weight = tf_int8.make({1, 1, 1, 1}, {2});
+  // bias [1] float (not quantized)
+  Tensor bias = tf_float.make({1}, {10.0});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor weight_scale = tf_float.make({1}, {0.5});
+  Tensor weight_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_float.zeros({1, 1, 2, 2});
+
+  int64_t stride_arr[] = {1, 1};
+  int64_t padding_arr[] = {0, 0};
+  int64_t dilation_arr[] = {1, 1};
+  IntArrayRef stride(stride_arr, 2);
+  IntArrayRef padding(padding_arr, 2);
+  IntArrayRef dilation(dilation_arr, 2);
+
+  // dequant inp: {1, 2, 3, 4}
+  // dequant weight: {1}
+  // conv (1x1): {1, 2, 3, 4} + bias 10.0 = {11, 12, 13, 14}
+  // output is float, no requant
+  cadence::fused_quant::native::convolution_out(
+      context_,
+      inp,
+      weight,
+      optional<Tensor>(bias),
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // bias qparams (none, bias is float)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams (none, float)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // conv params
+      stride,
+      padding,
+      dilation,
+      /*groups=*/1,
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_float.make({1, 1, 2, 2}, {11.0, 12.0, 13.0, 14.0}));
+}
+
+// Grouped convolution, all float (depthwise: groups == C_in == C_out == 2)
+TEST_F(FusedQuantConvolutionTest, GroupedConvolution) {
+  TensorFactory<ScalarType::Float> tf_float;
+
+  // inp [1, 2, 2, 2] float, 2 input channels
+  Tensor inp = tf_float.make(
+      {1, 2, 2, 2},
+      {1.0,
+       2.0,
+       3.0,
+       4.0, // channel 0
+       5.0,
+       6.0,
+       7.0,
+       8.0}); // channel 1
+
+  // weight [2, 1, 1, 1] float (groups=2, so C_in/groups=1)
+  Tensor weight = tf_float.make({2, 1, 1, 1}, {2.0, 3.0});
+
+  Tensor out = tf_float.zeros({1, 2, 2, 2});
+
+  int64_t stride_arr[] = {1, 1};
+  int64_t padding_arr[] = {0, 0};
+  int64_t dilation_arr[] = {1, 1};
+  IntArrayRef stride(stride_arr, 2);
+  IntArrayRef padding(padding_arr, 2);
+  IntArrayRef dilation(dilation_arr, 2);
+
+  // groups=2, depthwise
+  // channel 0: {1,2,3,4} * 2.0 = {2,4,6,8}
+  // channel 1: {5,6,7,8} * 3.0 = {15,18,21,24}
+  cadence::fused_quant::native::convolution_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // bias (absent)
+      // inp qparams (none, float)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // weight qparams (none, float)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // bias qparams (none)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams (none, float)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // conv params
+      stride,
+      padding,
+      dilation,
+      /*groups=*/2,
+      out);
+
+  EXPECT_TENSOR_EQ(
+      out,
+      tf_float.make(
+          {1, 2, 2, 2}, {2.0, 4.0, 6.0, 8.0, 15.0, 18.0, 21.0, 24.0}));
+}
diff --git a/backends/cadence/fused_quant/tests/test_op_hardswish.cpp b/backends/cadence/fused_quant/tests/test_op_hardswish.cpp
new file mode 100644
index 00000000000..e92989c64d2
--- /dev/null
+++ b/backends/cadence/fused_quant/tests/test_op_hardswish.cpp
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <executorch/backends/cadence/fused_quant/op_hardswish.h>
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::testing::TensorFactory;
+
+namespace { // file-local helpers for the hardswish tests
+
+optional<Tensor> none_tensor() {
+  return optional<Tensor>(); // empty: no scale / zero-point tensor supplied
+}
+
+optional<int64_t> none_axis() {
+  return optional<int64_t>(); // empty: no per-channel axis supplied
+}
+
+} // namespace
+
+class FusedQuantHardswishTest : public OperatorTest {};
+
+// All quantized: int8 → int8 (per-tensor)
+TEST_F(FusedQuantHardswishTest, AllQuantizedPerTensor) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  const std::vector<int> sizes{6};
+
+  Tensor inp = tf_int8.make(sizes, {-6, -3, 0, 3, 6, 10});
+
+  Tensor inp_scale = tf_float.make({1}, {1.0});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor out_scale = tf_float.make({1}, {1.0});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros(sizes);
+
+  // dequant inp: {-6, -3, 0, 3, 6, 10}; hardswish(x) = x*min(max(x+3,0),6)/6
+  // hardswish(-6) = -6 * min(max(-3,0),6)/6 = 0
+  // hardswish(-3) = -3 * min(max(0,0),6)/6 = 0
+  // hardswish(0)  = 0 * min(max(3,0),6)/6 = 0
+  // hardswish(3)  = 3 * min(max(6,0),6)/6 = 3
+  // hardswish(6)  = 6 * min(max(9,0),6)/6 = 6
+  // hardswish(10) = 10 * min(max(13,0),6)/6 = 10
+  // requant (scale=1.0, zp=0): {0, 0, 0, 3, 6, 10}
+  cadence::fused_quant::native::hardswish_out(
+      context_,
+      inp,
+      optional<Tensor>(inp_scale), // inp qparams: scale, then zero point
+      optional<Tensor>(inp_zp),
+      ScalarType::Float, // NOTE(review): Float for an int8 inp - confirm intent
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      optional<Tensor>(out_scale), // out qparams: scale, then zero point
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 0, 3, 6, 10}));
+}
+
+// float → int8: no input qparams, output requantized per-tensor
+TEST_F(FusedQuantHardswishTest, FloatInputQuantizedOutput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  const std::vector<int> sizes{6};
+
+  Tensor inp = tf_float.make(sizes, {-6.0, -3.0, 0.0, 3.0, 6.0, 10.0});
+
+  Tensor out_scale = tf_float.make({1}, {1.0});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros(sizes);
+
+  // hardswish: {0, 0, 0, 3, 6, 10}
+  // requant (scale=1.0, zp=0): {0, 0, 0, 3, 6, 10}
+  cadence::fused_quant::native::hardswish_out(
+      context_,
+      inp,
+      none_tensor(), // inp qparams: none, inp is already float
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      optional<Tensor>(out_scale), // out qparams: scale, then zero point
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 0, 3, 6, 10}));
+}
+
+// int8 → float: input dequantized per-tensor, output left as float
+TEST_F(FusedQuantHardswishTest, QuantizedInputFloatOutput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  const std::vector<int> sizes{6};
+
+  Tensor inp = tf_int8.make(sizes, {-6, -3, 0, 3, 6, 10});
+
+  Tensor inp_scale = tf_float.make({1}, {1.0});
+  Tensor inp_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_float.zeros(sizes);
+
+  // dequant inp (scale=1.0, zp=0): {-6, -3, 0, 3, 6, 10}
+  // hardswish: {0.0, 0.0, 0.0, 3.0, 6.0, 10.0}
+  cadence::fused_quant::native::hardswish_out(
+      context_,
+      inp,
+      optional<Tensor>(inp_scale), // inp qparams: scale, then zero point
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      none_tensor(), // out qparams: none, out stays float
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_float.make(sizes, {0.0, 0.0, 0.0, 3.0, 6.0, 10.0}));
+}
+
+// Per-channel dequantization on input (axis=0), per-tensor output
+TEST_F(FusedQuantHardswishTest, PerChannelInput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // Shape [2, 3], axis=0 → 2 channels, axis_stride=3
+  const std::vector<int> sizes{2, 3};
+
+  Tensor inp = tf_int8.make(sizes, {-6, -3, 0, 3, 6, 10});
+
+  // Per-channel: channel 0 scale=1.0, channel 1 scale=0.5
+  Tensor inp_scale = tf_float.make({2}, {1.0, 0.5});
+  Tensor inp_zp = tf_long.make({2}, {0, 0});
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros(sizes);
+
+  // dequant channel 0 (scale=1.0): {-6, -3, 0}
+  // dequant channel 1 (scale=0.5): {1.5, 3.0, 5.0}
+  // hardswish(-6) = 0, hardswish(-3) = 0, hardswish(0) = 0
+  // hardswish(1.5) = 1.5 * min(max(4.5,0),6)/6 = 1.5*4.5/6 = 1.125
+  // hardswish(3.0) = 3 * min(max(6,0),6)/6 = 3*6/6 = 3.0
+  // hardswish(5.0) = 5 * min(max(8,0),6)/6 = 5*6/6 = 5.0
+  // requant (scale=0.5, zp=0): round(0/0.5)=0, 0, 0,
+  //   round(1.125/0.5)=round(2.25)=2, round(3.0/0.5)=6, round(5.0/0.5)=10
+  cadence::fused_quant::native::hardswish_out(
+      context_,
+      inp,
+      optional<Tensor>(inp_scale), // inp qparams: one scale / zp per channel
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      optional<int64_t>(0), // per-channel along axis 0
+      optional<Tensor>(out_scale), // out qparams: single scale / zp
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 0, 2, 6, 10}));
+}
+
+// Per-channel quantization on output (axis=0), float input
+TEST_F(FusedQuantHardswishTest, PerChannelOutput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // Shape [2, 3], axis=0 → 2 channels
+  const std::vector<int> sizes{2, 3};
+
+  Tensor inp = tf_float.make(sizes, {-6.0, 0.0, 3.0, 6.0, 10.0, 12.0});
+
+  // Per-channel output: channel 0 scale=1.0, channel 1 scale=0.5
+  Tensor out_scale = tf_float.make({2}, {1.0, 0.5});
+  Tensor out_zp = tf_long.make({2}, {0, 0});
+
+  Tensor out = tf_int8.zeros(sizes);
+
+  // hardswish(-6) = 0, hardswish(0) = 0, hardswish(3) = 3
+  // hardswish(6) = 6, hardswish(10) = 10, hardswish(12) = 12
+  // requant channel 0 (scale=1.0): round(0/1)=0, round(0/1)=0, round(3/1)=3
+  // requant channel 1 (scale=0.5): round(6/0.5)=12, round(10/0.5)=20,
+  // round(12/0.5)=24
+  cadence::fused_quant::native::hardswish_out(
+      context_,
+      inp,
+      none_tensor(), // inp qparams: none, inp is already float
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      optional<Tensor>(out_scale), // out qparams: one scale / zp per channel
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      optional<int64_t>(0), // per-channel along axis 0
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 3, 12, 20, 24}));
+}
+
+// Non-zero zero points: inp zp=2, out zp=1 (both per-tensor)
+TEST_F(FusedQuantHardswishTest, NonZeroZeroPoint) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  const std::vector<int> sizes{6};
+
+  Tensor inp = tf_int8.make(sizes, {-4, -1, 2, 5, 8, 12});
+
+  // scale=1.0, zp=2 → dequant: (v-2)*1.0
+  Tensor inp_scale = tf_float.make({1}, {1.0});
+  Tensor inp_zp = tf_long.make({1}, {2});
+  // out scale=1.0, zp=1 → requant: round(f/1.0)+1
+  Tensor out_scale = tf_float.make({1}, {1.0});
+  Tensor out_zp = tf_long.make({1}, {1});
+
+  Tensor out = tf_int8.zeros(sizes);
+
+  // dequant inp: {-6, -3, 0, 3, 6, 10}
+  // hardswish: {0, 0, 0, 3, 6, 10}
+  // requant (scale=1.0, zp=1): round(0/1)+1=1, 1, 1,
+  //   round(3/1)+1=4, round(6/1)+1=7, round(10/1)+1=11
+  cadence::fused_quant::native::hardswish_out(
+      context_,
+      inp,
+      optional<Tensor>(inp_scale), // inp qparams: scale, then zero point
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      optional<Tensor>(out_scale), // out qparams: scale, then zero point
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {1, 1, 1, 4, 7, 11}));
+}
+
+// Float inputs <= -3 should all give 0 (negative saturation region)
+TEST_F(FusedQuantHardswishTest, NegativeRegion) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  const std::vector<int> sizes{4};
+
+  Tensor inp = tf_float.make(sizes, {-10.0, -6.0, -4.0, -3.0});
+
+  Tensor out_scale = tf_float.make({1}, {1.0});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros(sizes);
+
+  // hardswish(-10) = -10 * min(max(-7,0),6)/6 = 0
+  // hardswish(-6)  = -6 * min(max(-3,0),6)/6 = 0
+  // hardswish(-4)  = -4 * min(max(-1,0),6)/6 = 0
+  // hardswish(-3)  = -3 * min(max(0,0),6)/6 = 0
+  // requant (scale=1.0, zp=0): {0, 0, 0, 0}
+  cadence::fused_quant::native::hardswish_out(
+      context_,
+      inp,
+      none_tensor(), // inp qparams: none, inp is already float
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      optional<Tensor>(out_scale), // out qparams: scale, then zero point
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, 0, 0, 0}));
+}
+
+// Float in, float out: values >= 3 pass through unchanged (linear region)
+TEST_F(FusedQuantHardswishTest, LinearRegion) {
+  TensorFactory<ScalarType::Float> tf_float;
+
+  const std::vector<int> sizes{4};
+
+  Tensor inp = tf_float.make(sizes, {3.0, 4.0, 6.0, 10.0});
+
+  Tensor out = tf_float.zeros(sizes);
+
+  // hardswish(3)  = 3 * min(max(6,0),6)/6 = 3
+  // hardswish(4)  = 4 * min(max(7,0),6)/6 = 4
+  // hardswish(6)  = 6 * min(max(9,0),6)/6 = 6
+  // hardswish(10) = 10 * min(max(13,0),6)/6 = 10
+  cadence::fused_quant::native::hardswish_out(
+      context_,
+      inp,
+      none_tensor(), // inp qparams: none, inp is already float
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      none_tensor(), // out qparams: none, out stays float
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_float.make(sizes, {3.0, 4.0, 6.0, 10.0}));
+}
+
+// Values in [-3, 3], endpoints included, use the piecewise formula
+TEST_F(FusedQuantHardswishTest, TransitionRegion) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  const std::vector<int> sizes{5};
+
+  // int8 input with scale=0.5, zp=0 → float {-3.0, -1.5, 0.0, 1.5, 3.0}
+  Tensor inp = tf_int8.make(sizes, {-6, -3, 0, 3, 6});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor out_scale = tf_float.make({1}, {0.125});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros(sizes);
+
+  // dequant: {-3.0, -1.5, 0.0, 1.5, 3.0}
+  // hardswish(-3.0) = -3*min(max(0,0),6)/6 = 0
+  // hardswish(-1.5) = -1.5*min(max(1.5,0),6)/6 = -1.5*1.5/6 = -0.375
+  // hardswish(0)    = 0*min(max(3,0),6)/6 = 0
+  // hardswish(1.5)  = 1.5*min(max(4.5,0),6)/6 = 1.5*4.5/6 = 1.125
+  // hardswish(3.0)  = 3*min(max(6,0),6)/6 = 3*6/6 = 3.0
+  // requant (scale=0.125, zp=0): round(0/0.125)=0, round(-0.375/0.125)=-3,
+  //   round(0/0.125)=0, round(1.125/0.125)=9, round(3.0/0.125)=24
+  cadence::fused_quant::native::hardswish_out(
+      context_,
+      inp,
+      optional<Tensor>(inp_scale), // inp qparams: scale, then zero point
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      optional<Tensor>(out_scale), // out qparams: scale, then zero point
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make(sizes, {0, -3, 0, 9, 24}));
+}
diff --git a/backends/cadence/fused_quant/tests/test_op_linear.cpp b/backends/cadence/fused_quant/tests/test_op_linear.cpp
new file mode 100644
index 00000000000..ecba8cf7a3e
--- /dev/null
+++ b/backends/cadence/fused_quant/tests/test_op_linear.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <executorch/backends/cadence/fused_quant/op_linear.h>
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::testing::TensorFactory;
+
+namespace { // file-local helpers for the linear tests
+
+optional<Tensor> none_tensor() {
+  return optional<Tensor>(); // empty: absent bias or scale / zero-point tensor
+}
+
+optional<int64_t> none_axis() {
+  return optional<int64_t>(); // empty: no per-channel axis supplied
+}
+
+} // namespace
+
+class FusedQuantLinearTest : public OperatorTest {};
+
+// All quantized, no bias: int8 inp + int8 weight -> int8 out (per-tensor)
+TEST_F(FusedQuantLinearTest, AllQuantizedNoBias) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: int8 {2,4}, scale=0.5, zp=0 -> float {1.0, 2.0}
+  Tensor inp = tf_int8.make({1, 2}, {2, 4});
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+
+  // weight [2,2]: int8 {2,0,0,2}, scale=0.5, zp=0
+  //   -> float {{1,0},{0,1}} (identity)
+  Tensor weight = tf_int8.make({2, 2}, {2, 0, 0, 2});
+  Tensor weight_scale = tf_float.make({1}, {0.5});
+  Tensor weight_zp = tf_long.make({1}, {0});
+
+  // out qparams: scale=0.5, zp=0
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 2});
+
+  // linear: {1,2} @ identity = {1,2}
+  // requant (scale=0.5, zp=0): {round(1/0.5), round(2/0.5)} = {2, 4}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float, // NOTE(review): Float for an int8 inp - confirm intent
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      // bias qparams (unused, no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2}, {2, 4}));
+}
+
+// All quantized with bias: int8 inp + int8 weight + int8 bias -> int8 out
+TEST_F(FusedQuantLinearTest, AllQuantizedWithBias) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: int8 {2,4}, scale=0.5, zp=0 -> float {1.0, 2.0}
+  Tensor inp = tf_int8.make({1, 2}, {2, 4});
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+
+  // weight [2,2]: int8 {2,0,0,2}, scale=0.5, zp=0
+  //   -> float {{1,0},{0,1}} (identity)
+  Tensor weight = tf_int8.make({2, 2}, {2, 0, 0, 2});
+  Tensor weight_scale = tf_float.make({1}, {0.5});
+  Tensor weight_zp = tf_long.make({1}, {0});
+
+  // bias [2]: int8 {2,2}, scale=0.5, zp=0 -> float {1.0, 1.0}
+  Tensor bias = tf_int8.make({2}, {2, 2});
+  Tensor bias_scale = tf_float.make({1}, {0.5});
+  Tensor bias_zp = tf_long.make({1}, {0});
+
+  // out qparams: scale=0.5, zp=0
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 2});
+
+  // linear: {1,2} @ identity + {1,1} = {2, 3}
+  // requant (scale=0.5, zp=0): {round(2/0.5), round(3/0.5)} = {4, 6}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      optional<Tensor>(bias), // bias present, quantized like inp and weight
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      // bias qparams
+      optional<Tensor>(bias_scale),
+      optional<Tensor>(bias_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2}, {4, 6}));
+}
+
+// Float inp and weight, no bias -> int8 output (per-tensor requant)
+TEST_F(FusedQuantLinearTest, FloatInputsQuantizedOutput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: float {1.0, 2.0}
+  Tensor inp = tf_float.make({1, 2}, {1.0, 2.0});
+
+  // weight [2,2]: float identity {{1,0},{0,1}}
+  Tensor weight = tf_float.make({2, 2}, {1.0, 0.0, 0.0, 1.0});
+
+  // out qparams: scale=0.5, zp=0
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 2});
+
+  // linear: {1,2} @ identity = {1, 2}
+  // requant (scale=0.5, zp=0): {round(1/0.5), round(2/0.5)} = {2, 4}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // weight qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // bias qparams (no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2}, {2, 4}));
+}
+
+// int8 inp and weight (per-tensor), no bias -> float output
+TEST_F(FusedQuantLinearTest, QuantizedInputsFloatOutput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: int8 {2,4}, scale=0.5, zp=0 -> float {1.0, 2.0}
+  Tensor inp = tf_int8.make({1, 2}, {2, 4});
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+
+  // weight [2,2]: int8 {2,0,0,2}, scale=0.5, zp=0 -> identity
+  Tensor weight = tf_int8.make({2, 2}, {2, 0, 0, 2});
+  Tensor weight_scale = tf_float.make({1}, {0.5});
+  Tensor weight_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_float.zeros({1, 2});
+
+  // linear: {1,2} @ identity = {1.0, 2.0}; no requant, out stays float
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      // bias qparams (no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams (float, not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_float.make({1, 2}, {1.0, 2.0}));
+}
+
+// Float inp, per-channel int8 weight (axis=0, one scale per weight row)
+TEST_F(FusedQuantLinearTest, PerChannelWeights) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: float {1.0, 2.0}
+  Tensor inp = tf_float.make({1, 2}, {1.0, 2.0});
+
+  // weight [2,2]: int8 {2,4,3,6}, per-channel axis=0
+  //   ch0 scale=0.5: {(2-0)*0.5, (4-0)*0.5} = {1.0, 2.0}
+  //   ch1 scale=1.0: {(3-0)*1.0, (6-0)*1.0} = {3.0, 6.0}
+  Tensor weight = tf_int8.make({2, 2}, {2, 4, 3, 6});
+  Tensor weight_scale = tf_float.make({2}, {0.5, 1.0});
+  Tensor weight_zp = tf_long.make({2}, {0, 0});
+
+  // out qparams: scale=0.5, zp=0
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 2});
+
+  // out[j] = inp . weight[j]: out[0] = 1*1 + 2*2 = 5, out[1] = 1*3 + 2*6 = 15
+  // requant (scale=0.5, zp=0): {round(5/0.5), round(15/0.5)} = {10, 30}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // weight qparams (per-channel, axis=0)
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      optional<int64_t>(0), // per-channel along axis 0
+      // bias qparams (no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2}, {10, 30}));
+}
+
+// Batched input: inp [2,2] (two rows), all-float path, no bias
+TEST_F(FusedQuantLinearTest, BatchedInput) {
+  TensorFactory<ScalarType::Float> tf_float;
+
+  // inp [2,2]: float, 2 batch rows
+  Tensor inp = tf_float.make({2, 2}, {1.0, 2.0, 3.0, 4.0});
+
+  // weight [2,2]: float identity
+  Tensor weight = tf_float.make({2, 2}, {1.0, 0.0, 0.0, 1.0});
+
+  Tensor out = tf_float.zeros({2, 2});
+
+  // linear row0: {1,2} @ identity = {1, 2}
+  // linear row1: {3,4} @ identity = {3, 4}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // weight qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // bias qparams (no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_float.make({2, 2}, {1.0, 2.0, 3.0, 4.0})); // == inp
+}
+
+// Non-zero zero points on inp and weight (zp=2) and on out (zp=1)
+TEST_F(FusedQuantLinearTest, NonZeroZeroPoint) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: int8 {6,8}, scale=0.25, zp=2
+  //   dequant: {(6-2)*0.25, (8-2)*0.25} = {1.0, 1.5}
+  Tensor inp = tf_int8.make({1, 2}, {6, 8});
+  Tensor inp_scale = tf_float.make({1}, {0.25});
+  Tensor inp_zp = tf_long.make({1}, {2});
+
+  // weight [2,2]: int8 {6,2,2,6}, scale=0.25, zp=2
+  //   dequant: {(6-2)*0.25, (2-2)*0.25, (2-2)*0.25, (6-2)*0.25}
+  //          = {1.0, 0.0, 0.0, 1.0} (identity)
+  Tensor weight = tf_int8.make({2, 2}, {6, 2, 2, 6});
+  Tensor weight_scale = tf_float.make({1}, {0.25});
+  Tensor weight_zp = tf_long.make({1}, {2});
+
+  // out: scale=0.5, zp=1
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {1});
+
+  Tensor out = tf_int8.zeros({1, 2});
+
+  // linear: {1.0, 1.5} @ identity = {1.0, 1.5}
+  // requant (scale=0.5, zp=1): {round(1.0/0.5)+1, round(1.5/0.5)+1} = {3, 4}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      // bias qparams (no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128, // int8 value range [-128, 127]
+      127,
+      none_axis(), // no axis: per-tensor
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2}, {3, 4}));
+}