diff --git a/backends/cadence/fused_quant/op_linear.cpp b/backends/cadence/fused_quant/op_linear.cpp
new file mode 100644
index 00000000000..be846fd5ede
--- /dev/null
+++ b/backends/cadence/fused_quant/op_linear.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// NOTE(review): include targets were lost in transit; reconstructed from the
+// Buck deps (:quant_utils, kernel_includes) — confirm against the repo.
+#include <executorch/backends/cadence/fused_quant/op_linear.h>
+#include <executorch/backends/cadence/fused_quant/quant_utils.h>
+#include <vector>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::KernelRuntimeContext;
+
+namespace {
+
+// Plain float linear: out[r, o] = sum_i inp[r, i] * weight[o, i] (+ bias[o]).
+// `bias` may be null, in which case the accumulator starts at 0.
+void linear_kernel(
+    const float* inp,
+    const float* weight,
+    const float* bias,
+    float* out,
+    int64_t num_rows,
+    int64_t in_features,
+    int64_t out_features) {
+  for (int64_t r = 0; r < num_rows; ++r) {
+    for (int64_t o = 0; o < out_features; ++o) {
+      float sum = bias ? bias[o] : 0.0f;
+      for (int64_t i = 0; i < in_features; ++i) {
+        sum += inp[r * in_features + i] * weight[o * in_features + i];
+      }
+      out[r * out_features + o] = sum;
+    }
+  }
+}
+
+} // namespace
+
+// Fused-quant linear: each of inp/weight/bias/out is independently either
+// float (scale absent) or quantized (scale present; dequantized/quantized
+// around the float kernel using the per-operand qparams).
+Tensor& linear_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& inp,
+    const Tensor& weight,
+    const optional<Tensor>& bias,
+    // inp qparams
+    const optional<Tensor>& inp_scale,
+    const optional<Tensor>& inp_zero_point,
+    ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    optional<int64_t> inp_axis,
+    // weight qparams
+    const optional<Tensor>& weight_scale,
+    const optional<Tensor>& weight_zero_point,
+    ScalarType weight_dtype,
+    int64_t weight_quant_min,
+    int64_t weight_quant_max,
+    optional<int64_t> weight_axis,
+    // bias qparams
+    const optional<Tensor>& bias_scale,
+    const optional<Tensor>& bias_zero_point,
+    ScalarType bias_dtype,
+    int64_t bias_quant_min,
+    int64_t bias_quant_max,
+    optional<int64_t> bias_axis,
+    // out qparams
+    const optional<Tensor>& out_scale,
+    const optional<Tensor>& out_zero_point,
+    ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    optional<int64_t> out_axis,
+    Tensor& out) {
+  int64_t in_features = inp.size(inp.dim() - 1);
+  int64_t out_features = weight.size(0);
+  int64_t num_rows = inp.numel() / in_features;
+  int64_t inp_numel = inp.numel();
+  int64_t weight_numel = weight.numel();
+  int64_t out_numel = num_rows * out_features;
+
+  // Presence of a scale tensor is the "is quantized" signal per operand.
+  bool inp_quantized = inp_scale.has_value();
+  bool weight_quantized = weight_scale.has_value();
+  bool bias_quantized = bias_scale.has_value();
+  bool out_quantized = out_scale.has_value();
+
+  // Dequantize inp
+  std::vector<float> inp_buf;
+  const float* const inp_float = [&]() -> const float* {
+    if (!inp_quantized) {
+      return inp.const_data_ptr<float>();
+    }
+    inp_buf.resize(inp_numel);
+    QParams qp = extract_qparams(
+        inp_scale, inp_zero_point, inp_quant_min, inp_quant_max, inp_axis, inp);
+    FUSED_QUANT_DTYPE_SWITCH(
+        inp.scalar_type(),
+        scalar_t,
+        dequantize_buffer(
+            inp.const_data_ptr<scalar_t>(), inp_buf.data(), inp_numel, qp);)
+    return inp_buf.data();
+  }();
+
+  // Dequantize weight
+  std::vector<float> weight_buf;
+  const float* const weight_float = [&]() -> const float* {
+    if (!weight_quantized) {
+      return weight.const_data_ptr<float>();
+    }
+    weight_buf.resize(weight_numel);
+    QParams qp = extract_qparams(
+        weight_scale,
+        weight_zero_point,
+        weight_quant_min,
+        weight_quant_max,
+        weight_axis,
+        weight);
+    FUSED_QUANT_DTYPE_SWITCH(
+        weight.scalar_type(),
+        scalar_t,
+        dequantize_buffer(
+            weight.const_data_ptr<scalar_t>(),
+            weight_buf.data(),
+            weight_numel,
+            qp);)
+    return weight_buf.data();
+  }();
+
+  // Dequantize bias if present and quantized
+  std::vector<float> bias_buf;
+  const float* const bias_float = [&]() -> const float* {
+    if (!bias.has_value()) {
+      return nullptr;
+    }
+    const Tensor& b = bias.value();
+    if (!bias_quantized) {
+      return b.const_data_ptr<float>();
+    }
+    int64_t bias_numel = b.numel();
+    bias_buf.resize(bias_numel);
+    QParams qp = extract_qparams(
+        bias_scale,
+        bias_zero_point,
+        bias_quant_min,
+        bias_quant_max,
+        bias_axis,
+        b);
+    FUSED_QUANT_DTYPE_SWITCH(
+        b.scalar_type(),
+        scalar_t,
+        dequantize_buffer(
+            b.const_data_ptr<scalar_t>(), bias_buf.data(), bias_numel, qp);)
+    return bias_buf.data();
+  }();
+
+  // Linear + optional quantize
+  if (out_quantized) {
+    std::vector<float> result_float(out_numel);
+    linear_kernel(
+        inp_float,
+        weight_float,
+        bias_float,
+        result_float.data(),
+        num_rows,
+        in_features,
+        out_features);
+    QParams qp = extract_qparams(
+        out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+    FUSED_QUANT_DTYPE_SWITCH(
+        out.scalar_type(),
+        scalar_t,
+        quantize_buffer(
+            result_float.data(),
+            out.mutable_data_ptr<scalar_t>(),
+            out_numel,
+            qp);)
+  } else {
+    linear_kernel(
+        inp_float,
+        weight_float,
+        bias_float,
+        out.mutable_data_ptr<float>(),
+        num_rows,
+        in_features,
+        out_features);
+  }
+
+  return out;
+}
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/op_linear.h b/backends/cadence/fused_quant/op_linear.h
new file mode 100644
index 00000000000..99d20ba5bbc
--- /dev/null
+++ b/backends/cadence/fused_quant/op_linear.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// NOTE(review): include targets were lost in transit; reconstructed from
+// usage (Tensor/optional/ScalarType + KernelRuntimeContext) — confirm.
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_runtime_context.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+executorch::aten::Tensor& linear_out(
+    executorch::runtime::KernelRuntimeContext& ctx,
+    const executorch::aten::Tensor& inp,
+    const executorch::aten::Tensor& weight,
+    const executorch::aten::optional<executorch::aten::Tensor>& bias,
+    // inp qparams
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    executorch::aten::ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    executorch::aten::optional<int64_t> inp_axis,
+    // weight qparams
+    const executorch::aten::optional<executorch::aten::Tensor>& weight_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>&
+        weight_zero_point,
+    executorch::aten::ScalarType weight_dtype,
+    int64_t weight_quant_min,
+    int64_t weight_quant_max,
+    executorch::aten::optional<int64_t> weight_axis,
+    // bias qparams
+    const executorch::aten::optional<executorch::aten::Tensor>& bias_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& bias_zero_point,
+    executorch::aten::ScalarType bias_dtype,
+    int64_t bias_quant_min,
+    int64_t bias_quant_max,
+    executorch::aten::optional<int64_t> bias_axis,
+    // out qparams
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    executorch::aten::ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    executorch::aten::optional<int64_t> out_axis,
+    executorch::aten::Tensor& out);
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/targets.bzl b/backends/cadence/fused_quant/targets.bzl
index f98a357ae90..2b0a82e623f 100644
--- a/backends/cadence/fused_quant/targets.bzl
+++ b/backends/cadence/fused_quant/targets.bzl
@@ -70,3 +70,15 @@ def define_common_targets():
         ],
         visibility = ["PUBLIC"],
     )
+
+    runtime.cxx_library(
+        name = "op_linear",
+        srcs = ["op_linear.cpp"],
+        exported_headers = ["op_linear.h"],
+        platforms = CXX,
+        deps = [
+            ":quant_utils",
+            "//executorch/runtime/kernel:kernel_includes",
+        ],
+        visibility = ["PUBLIC"],
+    )
diff --git a/backends/cadence/fused_quant/tests/BUCK b/backends/cadence/fused_quant/tests/BUCK
index 90b3af0aa45..6f085e26202 100644
--- a/backends/cadence/fused_quant/tests/BUCK
+++ b/backends/cadence/fused_quant/tests/BUCK
@@ -57,3 +57,14 @@ runtime.cxx_test(
         "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
     ],
 )
+
+runtime.cxx_test(
+    name = "test_op_linear",
+    srcs = ["test_op_linear.cpp"],
+    platforms = CXX,
+    deps = [
+        "//executorch/backends/cadence/fused_quant:op_linear",
+        "//executorch/kernels/test:gtest_utils",
+        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+    ],
+)
diff --git a/backends/cadence/fused_quant/tests/test_op_linear.cpp b/backends/cadence/fused_quant/tests/test_op_linear.cpp
new file mode 100644
index 00000000000..ecba8cf7a3e
--- /dev/null
+++ b/backends/cadence/fused_quant/tests/test_op_linear.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+// NOTE(review): include targets were lost in transit; reconstructed from the
+// Buck deps (gtest_utils, testing_util:tensor_util) and from the use of
+// OperatorTest/context_/TensorFactory/EXPECT_TENSOR_EQ below — confirm.
+#include <executorch/backends/cadence/fused_quant/op_linear.h>
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::testing::TensorFactory;
+
+namespace {
+
+optional<Tensor> none_tensor() {
+  return optional<Tensor>();
+}
+
+optional<int64_t> none_axis() {
+  return optional<int64_t>();
+}
+
+} // namespace
+
+class FusedQuantLinearTest : public OperatorTest {};
+
+// All quantized, no bias: int8 inp + int8 weight -> int8 out
+TEST_F(FusedQuantLinearTest, AllQuantizedNoBias) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: int8 {2,4}, scale=0.5, zp=0 -> float {1.0, 2.0}
+  Tensor inp = tf_int8.make({1, 2}, {2, 4});
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+
+  // weight [2,2]: int8 {2,0,0,2}, scale=0.5, zp=0
+  // -> float {{1,0},{0,1}} (identity)
+  Tensor weight = tf_int8.make({2, 2}, {2, 0, 0, 2});
+  Tensor weight_scale = tf_float.make({1}, {0.5});
+  Tensor weight_zp = tf_long.make({1}, {0});
+
+  // out qparams: scale=0.5, zp=0
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 2});
+
+  // linear: {1,2} @ identity = {1,2}
+  // requant (scale=0.5, zp=0): {round(1/0.5), round(2/0.5)} = {2, 4}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // bias qparams (unused, no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2}, {2, 4}));
+}
+
+// All quantized with bias: int8 inp + int8 weight + int8 bias -> int8 out
+TEST_F(FusedQuantLinearTest, AllQuantizedWithBias) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: int8 {2,4}, scale=0.5, zp=0 -> float {1.0, 2.0}
+  Tensor inp = tf_int8.make({1, 2}, {2, 4});
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+
+  // weight [2,2]: int8 {2,0,0,2}, scale=0.5, zp=0
+  // -> float {{1,0},{0,1}} (identity)
+  Tensor weight = tf_int8.make({2, 2}, {2, 0, 0, 2});
+  Tensor weight_scale = tf_float.make({1}, {0.5});
+  Tensor weight_zp = tf_long.make({1}, {0});
+
+  // bias [2]: int8 {2,2}, scale=0.5, zp=0 -> float {1.0, 1.0}
+  Tensor bias = tf_int8.make({2}, {2, 2});
+  Tensor bias_scale = tf_float.make({1}, {0.5});
+  Tensor bias_zp = tf_long.make({1}, {0});
+
+  // out qparams: scale=0.5, zp=0
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 2});
+
+  // linear: {1,2} @ identity + {1,1} = {2, 3}
+  // requant (scale=0.5, zp=0): {round(2/0.5), round(3/0.5)} = {4, 6}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      optional<Tensor>(bias),
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // bias qparams
+      optional<Tensor>(bias_scale),
+      optional<Tensor>(bias_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2}, {4, 6}));
+}
+
+// Float inputs -> int8 output
+TEST_F(FusedQuantLinearTest, FloatInputsQuantizedOutput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: float {1.0, 2.0}
+  Tensor inp = tf_float.make({1, 2}, {1.0, 2.0});
+
+  // weight [2,2]: float identity {{1,0},{0,1}}
+  Tensor weight = tf_float.make({2, 2}, {1.0, 0.0, 0.0, 1.0});
+
+  // out qparams: scale=0.5, zp=0
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 2});
+
+  // linear: {1,2} @ identity = {1, 2}
+  // requant (scale=0.5, zp=0): {2, 4}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // weight qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // bias qparams (no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2}, {2, 4}));
+}
+
+// int8 inputs -> float output
+TEST_F(FusedQuantLinearTest, QuantizedInputsFloatOutput) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: int8 {2,4}, scale=0.5, zp=0 -> float {1.0, 2.0}
+  Tensor inp = tf_int8.make({1, 2}, {2, 4});
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+
+  // weight [2,2]: int8 {2,0,0,2}, scale=0.5, zp=0 -> identity
+  Tensor weight = tf_int8.make({2, 2}, {2, 0, 0, 2});
+  Tensor weight_scale = tf_float.make({1}, {0.5});
+  Tensor weight_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_float.zeros({1, 2});
+
+  // linear: {1,2} @ identity = {1.0, 2.0}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // bias qparams (no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams (float, not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_float.make({1, 2}, {1.0, 2.0}));
+}
+
+// Per-channel quantized weights (axis=0)
+TEST_F(FusedQuantLinearTest, PerChannelWeights) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: float {1.0, 2.0}
+  Tensor inp = tf_float.make({1, 2}, {1.0, 2.0});
+
+  // weight [2,2]: int8 {2,4,3,6}, per-channel axis=0
+  // ch0 scale=0.5: {(2-0)*0.5, (4-0)*0.5} = {1.0, 2.0}
+  // ch1 scale=1.0: {(3-0)*1.0, (6-0)*1.0} = {3.0, 6.0}
+  Tensor weight = tf_int8.make({2, 2}, {2, 4, 3, 6});
+  Tensor weight_scale = tf_float.make({2}, {0.5, 1.0});
+  Tensor weight_zp = tf_long.make({2}, {0, 0});
+
+  // out qparams: scale=0.5, zp=0
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 2});
+
+  // linear: out[0] = 1*1 + 2*2 = 5, out[1] = 1*3 + 2*6 = 15
+  // requant (scale=0.5, zp=0): {round(5/0.5), round(15/0.5)} = {10, 30}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // weight qparams (per-channel, axis=0)
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      optional<int64_t>(0),
+      // bias qparams (no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2}, {10, 30}));
+}
+
+// Batched input: inp [2,2]
+TEST_F(FusedQuantLinearTest, BatchedInput) {
+  TensorFactory<ScalarType::Float> tf_float;
+
+  // inp [2,2]: float, 2 batch rows
+  Tensor inp = tf_float.make({2, 2}, {1.0, 2.0, 3.0, 4.0});
+
+  // weight [2,2]: float identity
+  Tensor weight = tf_float.make({2, 2}, {1.0, 0.0, 0.0, 1.0});
+
+  Tensor out = tf_float.zeros({2, 2});
+
+  // linear row0: {1,2} @ identity = {1, 2}
+  // linear row1: {3,4} @ identity = {3, 4}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // weight qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // bias qparams (no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams (not quantized)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_float.make({2, 2}, {1.0, 2.0, 3.0, 4.0}));
+}
+
+// Non-zero zero points
+TEST_F(FusedQuantLinearTest, NonZeroZeroPoint) {
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  // inp [1,2]: int8 {6,8}, scale=0.25, zp=2
+  // dequant: {(6-2)*0.25, (8-2)*0.25} = {1.0, 1.5}
+  Tensor inp = tf_int8.make({1, 2}, {6, 8});
+  Tensor inp_scale = tf_float.make({1}, {0.25});
+  Tensor inp_zp = tf_long.make({1}, {2});
+
+  // weight [2,2]: int8 {6,2,2,6}, scale=0.25, zp=2
+  // dequant: {(6-2)*0.25, (2-2)*0.25, (2-2)*0.25, (6-2)*0.25}
+  // = {1.0, 0.0, 0.0, 1.0} (identity)
+  Tensor weight = tf_int8.make({2, 2}, {6, 2, 2, 6});
+  Tensor weight_scale = tf_float.make({1}, {0.25});
+  Tensor weight_zp = tf_long.make({1}, {2});
+
+  // out: scale=0.5, zp=1
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {1});
+
+  Tensor out = tf_int8.zeros({1, 2});
+
+  // linear: {1.0, 1.5} @ identity = {1.0, 1.5}
+  // requant (scale=0.5, zp=1): {round(1.0/0.5)+1, round(1.5/0.5)+1} = {3, 4}
+  cadence::fused_quant::native::linear_out(
+      context_,
+      inp,
+      weight,
+      none_tensor(), // no bias
+      // inp qparams
+      optional<Tensor>(inp_scale),
+      optional<Tensor>(inp_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // weight qparams
+      optional<Tensor>(weight_scale),
+      optional<Tensor>(weight_zp),
+      ScalarType::Float,
+      -128,
+      127,
+      none_axis(),
+      // bias qparams (no bias)
+      none_tensor(),
+      none_tensor(),
+      ScalarType::Float,
+      0,
+      0,
+      none_axis(),
+      // out qparams
+      optional<Tensor>(out_scale),
+      optional<Tensor>(out_zp),
+      ScalarType::Char,
+      -128,
+      127,
+      none_axis(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2}, {3, 4}));
+}