diff --git a/backends/cadence/fused_quant/op_bmm.cpp b/backends/cadence/fused_quant/op_bmm.cpp
new file mode 100644
index 00000000000..2c79bcb6a59
--- /dev/null
+++ b/backends/cadence/fused_quant/op_bmm.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// NOTE(review): the three include targets were lost when this patch was
+// extracted; the project paths below are reconstructed from the build deps
+// of the op_bmm target -- confirm them against the tree.
+#include <executorch/backends/cadence/fused_quant/op_bmm.h>
+#include <executorch/backends/cadence/fused_quant/quant_utils.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+#include <cstdint>
+#include <vector>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::KernelRuntimeContext;
+
+namespace {
+
+/**
+ * Reference batched matrix multiply over dense row-major float buffers.
+ *
+ * For each batch index b, computes out[b] = inp[b] * other[b], where inp[b]
+ * is M x K, other[b] is K x N and out[b] is M x N. Products are accumulated
+ * in float in ascending k order; when K == 0 every output element is 0.
+ */
+void bmm_kernel(
+    const float* inp,
+    const float* other,
+    float* out,
+    int64_t batch,
+    int64_t M,
+    int64_t K,
+    int64_t N) {
+  for (int64_t b = 0; b < batch; ++b) {
+    // Per-batch base pointers, hoisted out of the inner loops.
+    const float* const inp_b = inp + b * M * K;
+    const float* const other_b = other + b * K * N;
+    float* const out_b = out + b * M * N;
+    for (int64_t m = 0; m < M; ++m) {
+      const float* const inp_row = inp_b + m * K;
+      for (int64_t n = 0; n < N; ++n) {
+        float sum = 0.0f;
+        for (int64_t k = 0; k < K; ++k) {
+          sum += inp_row[k] * other_b[k * N + n];
+        }
+        out_b[m * N + n] = sum;
+      }
+    }
+  }
+}
+
+/**
+ * Returns a float buffer holding the values of `t` for bmm_kernel.
+ *
+ * When `scale` is empty, the tensor's own data pointer is returned as float
+ * (the caller checks the dtype beforehand). Otherwise `t` is dequantized into
+ * `scratch`, which is resized to t.numel(), and scratch.data() is returned;
+ * `scratch` must therefore outlive every use of the returned pointer.
+ */
+const float* dequantize_operand(
+    const Tensor& t,
+    const optional<Tensor>& scale,
+    const optional<Tensor>& zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    optional<int64_t> axis,
+    std::vector<float>& scratch) {
+  if (!scale.has_value()) {
+    return t.const_data_ptr<float>();
+  }
+  const int64_t numel = t.numel();
+  scratch.resize(numel);
+  QParams qp =
+      extract_qparams(scale, zero_point, quant_min, quant_max, axis, t);
+  FUSED_QUANT_DTYPE_SWITCH(
+      t.scalar_type(),
+      scalar_t,
+      dequantize_buffer(
+          t.const_data_ptr<scalar_t>(), scratch.data(), numel, qp);)
+  return scratch.data();
+}
+
+} // namespace
+
+/**
+ * Batched matrix multiply with optional per-operand (de)quantization.
+ *
+ * Each of inp / other / out is treated as quantized iff its `*_scale`
+ * optional holds a value. Quantized inputs are dequantized into temporary
+ * float buffers, the product is computed in float by bmm_kernel, and the
+ * result is either written directly to `out` or quantized into it.
+ *
+ * The scale / zero_point / quant_min / quant_max / axis arguments of each
+ * operand are forwarded unchanged to extract_qparams(). The inp_dtype,
+ * other_dtype and out_dtype arguments are not read by this implementation.
+ *
+ * @param ctx Kernel context; marked failed with InvalidArgument when an
+ *     argument check in the body fails.
+ * @param inp Left operand, read as [batch, M, K].
+ * @param other Right operand, read as [batch, K, N].
+ * @param out Destination; must hold batch * M * N elements.
+ * @return `out` (not written when an argument check fails).
+ */
+Tensor& bmm_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& inp,
+    const Tensor& other,
+    const optional<Tensor>& inp_scale,
+    const optional<Tensor>& inp_zero_point,
+    ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    optional<int64_t> inp_axis,
+    const optional<Tensor>& other_scale,
+    const optional<Tensor>& other_zero_point,
+    ScalarType other_dtype,
+    int64_t other_quant_min,
+    int64_t other_quant_max,
+    optional<int64_t> other_axis,
+    const optional<Tensor>& out_scale,
+    const optional<Tensor>& out_zero_point,
+    ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    optional<int64_t> out_axis,
+    Tensor& out) {
+  (void)inp_dtype;
+  (void)other_dtype;
+  (void)out_dtype;
+
+  const bool inp_quantized = inp_scale.has_value();
+  const bool other_quantized = other_scale.has_value();
+  const bool out_quantized = out_scale.has_value();
+
+  // bmm_kernel indexes raw buffers using these sizes, so reject arguments
+  // that would make it read or write out of bounds.
+  ET_KERNEL_CHECK(
+      ctx, inp.dim() == 3 && other.dim() == 3, InvalidArgument, out);
+  const int64_t batch = inp.size(0);
+  const int64_t M = inp.size(1);
+  const int64_t K = inp.size(2);
+  const int64_t N = other.size(2);
+  const int64_t out_numel = batch * M * N;
+  ET_KERNEL_CHECK(
+      ctx,
+      other.size(0) == batch && other.size(1) == K,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(ctx, out.numel() == out_numel, InvalidArgument, out);
+
+  // Operands without a scale are accessed as float with no conversion.
+  ET_KERNEL_CHECK(
+      ctx,
+      inp_quantized || inp.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      other_quantized || other.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      out_quantized || out.scalar_type() == ScalarType::Float,
+      InvalidArgument,
+      out);
+
+  // Scratch storage for dequantized inputs; stays empty for float operands.
+  std::vector<float> inp_buf;
+  const float* const inp_float = dequantize_operand(
+      inp,
+      inp_scale,
+      inp_zero_point,
+      inp_quant_min,
+      inp_quant_max,
+      inp_axis,
+      inp_buf);
+
+  std::vector<float> other_buf;
+  const float* const other_float = dequantize_operand(
+      other,
+      other_scale,
+      other_zero_point,
+      other_quant_min,
+      other_quant_max,
+      other_axis,
+      other_buf);
+
+  if (!out_quantized) {
+    bmm_kernel(
+        inp_float, other_float, out.mutable_data_ptr<float>(), batch, M, K, N);
+    return out;
+  }
+
+  // Compute in float, then quantize the whole result into `out`.
+  std::vector<float> result_float(out_numel);
+  bmm_kernel(inp_float, other_float, result_float.data(), batch, M, K, N);
+
+  QParams qp = extract_qparams(
+      out_scale, out_zero_point, out_quant_min, out_quant_max, out_axis, out);
+  FUSED_QUANT_DTYPE_SWITCH(
+      out.scalar_type(),
+      scalar_t,
+      quantize_buffer(
+          result_float.data(),
+          out.mutable_data_ptr<scalar_t>(),
+          out_numel,
+          qp);)
+  return out;
+}
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/op_bmm.h b/backends/cadence/fused_quant/op_bmm.h
new file mode 100644
index 00000000000..f814b46b481
--- /dev/null
+++ b/backends/cadence/fused_quant/op_bmm.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// NOTE(review): the two include targets were lost when this patch was
+// extracted; the paths below are reconstructed from the types this header
+// names -- confirm them against the tree.
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_runtime_context.h>
+
+namespace cadence {
+namespace fused_quant {
+namespace native {
+
+/**
+ * Batched matrix multiply, out = inp x other, in which each operand may
+ * independently be quantized.
+ *
+ * An operand is treated as quantized when its `*_scale` optional holds a
+ * value. `out` receives the result and is also the return value.
+ */
+executorch::aten::Tensor& bmm_out(
+    executorch::runtime::KernelRuntimeContext& ctx,
+    const executorch::aten::Tensor& inp,
+    const executorch::aten::Tensor& other,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& inp_zero_point,
+    executorch::aten::ScalarType inp_dtype,
+    int64_t inp_quant_min,
+    int64_t inp_quant_max,
+    executorch::aten::optional<int64_t> inp_axis,
+    const executorch::aten::optional<executorch::aten::Tensor>& other_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>&
+        other_zero_point,
+    executorch::aten::ScalarType other_dtype,
+    int64_t other_quant_min,
+    int64_t other_quant_max,
+    executorch::aten::optional<int64_t> other_axis,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_scale,
+    const executorch::aten::optional<executorch::aten::Tensor>& out_zero_point,
+    executorch::aten::ScalarType out_dtype,
+    int64_t out_quant_min,
+    int64_t out_quant_max,
+    executorch::aten::optional<int64_t> out_axis,
+    executorch::aten::Tensor& out);
+
+} // namespace native
+} // namespace fused_quant
+} // namespace cadence
diff --git a/backends/cadence/fused_quant/targets.bzl b/backends/cadence/fused_quant/targets.bzl
index 902d4d2727f..f98a357ae90 100644
--- a/backends/cadence/fused_quant/targets.bzl
+++ b/backends/cadence/fused_quant/targets.bzl
@@ -58,3 +58,15 @@ def define_common_targets():
         ],
         visibility = ["PUBLIC"],
     )
+
+    runtime.cxx_library(
+        name = "op_bmm",
+        srcs = ["op_bmm.cpp"],
+        exported_headers = ["op_bmm.h"],
+        platforms = CXX,
+        deps = [
+            ":quant_utils",
+            "//executorch/runtime/kernel:kernel_includes",
+        ],
+        visibility = ["PUBLIC"],
+    )
diff --git a/backends/cadence/fused_quant/tests/BUCK b/backends/cadence/fused_quant/tests/BUCK
index c503049dc69..90b3af0aa45 100644
--- 
a/backends/cadence/fused_quant/tests/BUCK
+++ b/backends/cadence/fused_quant/tests/BUCK
@@ -46,3 +46,14 @@ runtime.cxx_test(
         "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
     ],
 )
+
+runtime.cxx_test(
+    name = "test_op_bmm",
+    srcs = ["test_op_bmm.cpp"],
+    platforms = CXX,
+    deps = [
+        "//executorch/backends/cadence/fused_quant:op_bmm",
+        "//executorch/kernels/test:gtest_utils",
+        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+    ],
+)
diff --git a/backends/cadence/fused_quant/tests/test_op_bmm.cpp b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
new file mode 100644
index 00000000000..93c511a10d5
--- /dev/null
+++ b/backends/cadence/fused_quant/tests/test_op_bmm.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// NOTE(review): the include targets were lost when this patch was
+// extracted; the paths below are reconstructed from the BUCK deps of
+// test_op_bmm and the names used in this file -- confirm them.
+#include <executorch/backends/cadence/fused_quant/op_bmm.h>
+
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+#include <gtest/gtest.h>
+
+using executorch::aten::optional;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::testing::TensorFactory;
+
+namespace {
+
+optional<Tensor> none_tensor() {
+  return optional<Tensor>();
+}
+
+optional<int64_t> none_axis() {
+  return optional<int64_t>();
+}
+
+// Per-operand quantization arguments of bmm_out, minus the axis (every
+// test in this file passes a none axis).
+struct OperandQuant {
+  optional<Tensor> scale;
+  optional<Tensor> zero_point;
+  ScalarType dtype;
+  int64_t quant_min;
+  int64_t quant_max;
+};
+
+// Arguments for a float operand: no scale / zero_point, range 0..0.
+OperandQuant float_operand() {
+  return {none_tensor(), none_tensor(), ScalarType::Float, 0, 0};
+}
+
+// Arguments for a per-tensor quantized operand with range -128..127.
+OperandQuant quantized_operand(
+    const Tensor& scale,
+    const Tensor& zero_point,
+    ScalarType dtype) {
+  return {
+      optional<Tensor>(scale),
+      optional<Tensor>(zero_point),
+      dtype,
+      -128,
+      127};
+}
+
+} // namespace
+
+class FusedQuantBmmTest : public OperatorTest {
+ protected:
+  // Calls bmm_out with the given per-operand arguments and a none axis.
+  Tensor& bmm(
+      const Tensor& inp,
+      const OperandQuant& inp_q,
+      const Tensor& other,
+      const OperandQuant& other_q,
+      const OperandQuant& out_q,
+      Tensor& out) {
+    return cadence::fused_quant::native::bmm_out(
+        context_,
+        inp,
+        other,
+        inp_q.scale,
+        inp_q.zero_point,
+        inp_q.dtype,
+        inp_q.quant_min,
+        inp_q.quant_max,
+        none_axis(),
+        other_q.scale,
+        other_q.zero_point,
+        other_q.dtype,
+        other_q.quant_min,
+        other_q.quant_max,
+        none_axis(),
+        out_q.scale,
+        out_q.zero_point,
+        out_q.dtype,
+        out_q.quant_min,
+        out_q.quant_max,
+        none_axis(),
+        out);
+  }
+
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Long> tf_long;
+};
+
+// All quantized: int8 × int8 → int8 (per-tensor)
+TEST_F(FusedQuantBmmTest, AllQuantizedPerTensor) {
+  // scale=0.5, zp=0: int8 value v maps to v * 0.5
+  // inp [1,2,2]: identity {{1,0},{0,1}} -> int8 {2,0,0,2}
+  Tensor inp = tf_int8.make({1, 2, 2}, {2, 0, 0, 2});
+  // other [1,2,2]: {{1,2},{3,4}} -> int8 {2,4,6,8}
+  Tensor other = tf_int8.make({1, 2, 2}, {2, 4, 6, 8});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor other_scale = tf_float.make({1}, {0.5});
+  Tensor other_zp = tf_long.make({1}, {0});
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 2, 2});
+
+  // dequant inp: {{1,0},{0,1}}
+  // dequant other: {{1,2},{3,4}}
+  // bmm: I * {{1,2},{3,4}} = {{1,2},{3,4}}
+  // requant (scale=0.5, zp=0): {2, 4, 6, 8}
+  bmm(inp,
+      quantized_operand(inp_scale, inp_zp, ScalarType::Float),
+      other,
+      quantized_operand(other_scale, other_zp, ScalarType::Float),
+      quantized_operand(out_scale, out_zp, ScalarType::Char),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2, 2}, {2, 4, 6, 8}));
+}
+
+// float × float → int8
+TEST_F(FusedQuantBmmTest, FloatInputsQuantizedOutput) {
+  // identity
+  Tensor inp = tf_float.make({1, 2, 2}, {1.0, 0.0, 0.0, 1.0});
+  Tensor other = tf_float.make({1, 2, 2}, {1.0, 2.0, 3.0, 4.0});
+
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 2, 2});
+
+  // bmm: I * {{1,2},{3,4}} = {{1,2},{3,4}}
+  // requant (scale=0.5, zp=0): {2, 4, 6, 8}
+  bmm(inp,
+      float_operand(),
+      other,
+      float_operand(),
+      quantized_operand(out_scale, out_zp, ScalarType::Char),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2, 2}, {2, 4, 6, 8}));
+}
+
+// int8 × int8 → float
+TEST_F(FusedQuantBmmTest, QuantizedInputsFloatOutput) {
+  Tensor inp = tf_int8.make({1, 2, 2}, {2, 0, 0, 2});
+  Tensor other = tf_int8.make({1, 2, 2}, {2, 4, 6, 8});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor other_scale = tf_float.make({1}, {0.5});
+  Tensor other_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_float.zeros({1, 2, 2});
+
+  // dequant inp: {{1,0},{0,1}}
+  // dequant other: {{1,2},{3,4}}
+  // bmm: {{1,2},{3,4}}
+  bmm(inp,
+      quantized_operand(inp_scale, inp_zp, ScalarType::Float),
+      other,
+      quantized_operand(other_scale, other_zp, ScalarType::Float),
+      float_operand(),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_float.make({1, 2, 2}, {1.0, 2.0, 3.0, 4.0}));
+}
+
+// int8 × float → int8
+TEST_F(FusedQuantBmmTest, QuantizedInpFloatOther) {
+  Tensor inp = tf_int8.make({1, 2, 2}, {2, 0, 0, 2});
+  Tensor other = tf_float.make({1, 2, 2}, {1.0, 2.0, 3.0, 4.0});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({1, 2, 2});
+
+  // dequant inp: {{1,0},{0,1}}
+  // bmm: I * {{1,2},{3,4}} = {{1,2},{3,4}}
+  // requant (scale=0.5, zp=0): {2, 4, 6, 8}
+  bmm(inp,
+      quantized_operand(inp_scale, inp_zp, ScalarType::Float),
+      other,
+      float_operand(),
+      quantized_operand(out_scale, out_zp, ScalarType::Char),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2, 2}, {2, 4, 6, 8}));
+}
+
+// Non-zero zero_point
+TEST_F(FusedQuantBmmTest, NonZeroZeroPoint) {
+  // scale=0.25, zp=2: int8 value v maps to (v - 2) * 0.25
+  // inp: {{1,0.5},{0.5,1}} -> int8: (1/0.25)+2=6, (0.5/0.25)+2=4, 4, 6
+  Tensor inp = tf_int8.make({1, 2, 2}, {6, 4, 4, 6});
+  // other: {{1,2},{0,1}} -> int8: (1/0.25)+2=6, (2/0.25)+2=10, (0/0.25)+2=2,
+  // (1/0.25)+2=6
+  Tensor other = tf_int8.make({1, 2, 2}, {6, 10, 2, 6});
+
+  Tensor inp_scale = tf_float.make({1}, {0.25});
+  Tensor inp_zp = tf_long.make({1}, {2});
+  Tensor other_scale = tf_float.make({1}, {0.25});
+  Tensor other_zp = tf_long.make({1}, {2});
+  // out: scale=0.5, zp=1 -> float f maps to round(f / 0.5) + 1
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {1});
+
+  Tensor out = tf_int8.zeros({1, 2, 2});
+
+  // dequant inp: (6-2)*0.25=1, (4-2)*0.25=0.5, (4-2)*0.25=0.5, (6-2)*0.25=1
+  //   -> {{1, 0.5}, {0.5, 1}}
+  // dequant other: (6-2)*0.25=1, (10-2)*0.25=2, (2-2)*0.25=0, (6-2)*0.25=1
+  //   -> {{1, 2}, {0, 1}}
+  // bmm: {{1*1+0.5*0, 1*2+0.5*1}, {0.5*1+1*0, 0.5*2+1*1}}
+  //    = {{1, 2.5}, {0.5, 2}}
+  // requant (scale=0.5, zp=1):
+  //   round(1/0.5)+1=3, round(2.5/0.5)+1=6,
+  //   round(0.5/0.5)+1=2, round(2/0.5)+1=5
+  bmm(inp,
+      quantized_operand(inp_scale, inp_zp, ScalarType::Float),
+      other,
+      quantized_operand(other_scale, other_zp, ScalarType::Float),
+      quantized_operand(out_scale, out_zp, ScalarType::Char),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({1, 2, 2}, {3, 6, 2, 5}));
+}
+
+// batch=2, verify both batch elements
+TEST_F(FusedQuantBmmTest, LargerBatch) {
+  // inp [2,2,2]: two identity matrices
+  // other [2,2,2]: batch 0 = {{1,2},{3,4}}, batch 1 = {{5,6},{7,8}}
+  // scale=0.5, zp=0: two identity matrices as int8
+  Tensor inp = tf_int8.make({2, 2, 2}, {2, 0, 0, 2, 2, 0, 0, 2});
+  Tensor other = tf_int8.make({2, 2, 2}, {2, 4, 6, 8, 10, 12, 14, 16});
+
+  Tensor inp_scale = tf_float.make({1}, {0.5});
+  Tensor inp_zp = tf_long.make({1}, {0});
+  Tensor other_scale = tf_float.make({1}, {0.5});
+  Tensor other_zp = tf_long.make({1}, {0});
+  Tensor out_scale = tf_float.make({1}, {0.5});
+  Tensor out_zp = tf_long.make({1}, {0});
+
+  Tensor out = tf_int8.zeros({2, 2, 2});
+
+  // dequant inp: two identity matrices {{1,0},{0,1}}, {{1,0},{0,1}}
+  // dequant other: {{1,2},{3,4}}, {{5,6},{7,8}}
+  // bmm batch 0: I * {{1,2},{3,4}} = {{1,2},{3,4}}
+  // bmm batch 1: I * {{5,6},{7,8}} = {{5,6},{7,8}}
+  // requant (scale=0.5, zp=0): {2,4,6,8, 10,12,14,16}
+  bmm(inp,
+      quantized_operand(inp_scale, inp_zp, ScalarType::Float),
+      other,
+      quantized_operand(other_scale, other_zp, ScalarType::Float),
+      quantized_operand(out_scale, out_zp, ScalarType::Char),
+      out);
+
+  EXPECT_TENSOR_EQ(out, tf_int8.make({2, 2, 2}, {2, 4, 6, 8, 10, 12, 14, 16}));
+}