diff --git a/host/lib/convert/CMakeLists.txt b/host/lib/convert/CMakeLists.txt index aee62c82f4..ecddcd07f0 100644 --- a/host/lib/convert/CMakeLists.txt +++ b/host/lib/convert/CMakeLists.txt @@ -1,6 +1,7 @@ # # Copyright 2011-2013 Ettus Research LLC # Copyright 2018 Ettus Research, a National Instruments Company +# Copyright 2024 Ettus Research, a National Instruments Company # # SPDX-License-Identifier: GPL-3.0-or-later # @@ -9,29 +10,55 @@ # This file included, use CMake directory variables ######################################################################## include(CheckIncludeFileCXX) +include(CheckCXXCompilerFlag) message(STATUS "") ######################################################################## -# Check for SSE2 SIMD headers +# Check for x86 SIMD compiler support ######################################################################## -if(CMAKE_COMPILER_IS_GNUCXX) - set(EMMINTRIN_FLAGS -msse2) - set(TMMINTRIN_FLAGS -mssse3) -elseif(MSVC) - set(EMMINTRIN_FLAGS /arch:SSE2) + +# Check if we're on an x86 platform +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i[3-6]86") + set(IS_X86_PLATFORM TRUE) +else() + set(IS_X86_PLATFORM FALSE) endif() -set(CMAKE_REQUIRED_FLAGS ${EMMINTRIN_FLAGS}) -CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H) -unset(CMAKE_REQUIRED_FLAGS) +if(IS_X86_PLATFORM) + # Check for SSE2 compiler support + if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + check_cxx_compiler_flag("-msse2" COMPILER_SUPPORTS_SSE2) + check_cxx_compiler_flag("-mssse3" COMPILER_SUPPORTS_SSSE3) + check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2) + set(SSE2_FLAGS -msse2) + set(SSSE3_FLAGS -mssse3) + set(AVX2_FLAGS -mavx2) + elseif(MSVC) + # MSVC doesn't need flags for SSE2 on x64, but we can check + set(COMPILER_SUPPORTS_SSE2 TRUE) + set(COMPILER_SUPPORTS_SSSE3 TRUE) + check_cxx_compiler_flag("/arch:AVX2" COMPILER_SUPPORTS_AVX2) + set(SSE2_FLAGS "") # Default on x64 + set(SSSE3_FLAGS "") # Default on x64 + set(AVX2_FLAGS /arch:AVX2) + endif() -if(ENABLE_SSSE3) -set(CMAKE_REQUIRED_FLAGS ${TMMINTRIN_FLAGS}) -CHECK_INCLUDE_FILE_CXX(tmmintrin.h HAVE_TMMINTRIN_H) -unset(CMAKE_REQUIRED_FLAGS) -endif(ENABLE_SSSE3) + if(COMPILER_SUPPORTS_SSE2) + message(STATUS "Compiler supports SSE2 - will build SSE2 converters") + endif() + if(COMPILER_SUPPORTS_SSSE3) + message(STATUS "Compiler supports SSSE3 - will build SSSE3 converters") + endif() + if(COMPILER_SUPPORTS_AVX2) + message(STATUS "Compiler supports AVX2 - will build AVX2 converters") + endif() +endif() -if(HAVE_EMMINTRIN_H) +######################################################################## +# x86 SIMD converter sources +######################################################################## + +if(COMPILER_SUPPORTS_SSE2) set(convert_with_sse2_sources ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_sc16.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_fc64.cpp @@ -45,22 +72,40 @@ if(HAVE_EMMINTRIN_H) ) set_source_files_properties( ${convert_with_sse2_sources} - PROPERTIES COMPILE_FLAGS "${EMMINTRIN_FLAGS}" + PROPERTIES COMPILE_FLAGS "${SSE2_FLAGS}" ) LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources}) -endif(HAVE_EMMINTRIN_H) +endif() -if(HAVE_TMMINTRIN_H) +if(COMPILER_SUPPORTS_SSSE3) set(convert_with_ssse3_sources ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_pack_sc12.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_unpack_sc12.cpp ) set_source_files_properties( ${convert_with_ssse3_sources} - PROPERTIES COMPILE_FLAGS "${TMMINTRIN_FLAGS}" + PROPERTIES COMPILE_FLAGS "${SSSE3_FLAGS}" ) 
LIBUHD_APPEND_SOURCES(${convert_with_ssse3_sources}) -endif(HAVE_TMMINTRIN_H) +endif() + +if(ENABLE_AVX2 AND COMPILER_SUPPORTS_AVX2) + set(convert_with_avx2_sources + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc64.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc8_to_fc32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc8.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc8.cpp + ) + set_source_files_properties( + ${convert_with_avx2_sources} + PROPERTIES COMPILE_FLAGS "${AVX2_FLAGS}" + ) + LIBUHD_APPEND_SOURCES(${convert_with_avx2_sources}) +endif() ######################################################################## # Check for NEON SIMD headers diff --git a/host/lib/convert/avx2_fc32_to_sc16.cpp b/host/lib/convert/avx2_fc32_to_sc16.cpp new file mode 100644 index 0000000000..e00cc04df2 --- /dev/null +++ b/host/lib/convert/avx2_fc32_to_sc16.cpp @@ -0,0 +1,120 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD_AVX2) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256 tmplo = _mm256_loadu_ps(reinterpret_cast(input + i + 0)); + __m256 tmphi = _mm256_loadu_ps(reinterpret_cast(input + i + 4)); + + /* convert and scale */ + __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); + + __m256i shuffled_lo = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ + __m256i shuffled_hi = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ + + /* now pack the shuffled data sequentially */ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); + + /* pack + swap 16-bit pairs */ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD_AVX2) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256 tmplo = _mm256_loadu_ps(reinterpret_cast(input + i + 0)); + __m256 tmphi = _mm256_loadu_ps(reinterpret_cast(input + i + 4)); + + /* convert and scale */ + __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); + + __m256i shuffled_lo = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ + __m256i shuffled_hi = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ + + /* Now pack the shuffled data sequentially */ + __m256i tmpi = 
_mm256_packs_epi32(shuffled_lo, shuffled_hi); + + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc16_chdr, 1, PRIORITY_SIMD_AVX2) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256 tmplo = _mm256_loadu_ps(reinterpret_cast(input + i + 0)); + __m256 tmphi = _mm256_loadu_ps(reinterpret_cast(input + i + 4)); + + /* convert and scale */ + __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); + + /* mm256_packs_epi32 is not sequential, it needs to be split into m128i */ + __m256i shuffled_lo = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ + __m256i shuffled_hi = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ + + /* Now pack the shuffled data sequentially */ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert any remaining samples + xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc32_to_sc8.cpp b/host/lib/convert/avx2_fc32_to_sc8.cpp new file mode 100644 index 0000000000..bf80423eab --- /dev/null +++ b/host/lib/convert/avx2_fc32_to_sc8.cpp @@ -0,0 +1,98 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +template +UHD_INLINE __m256i pack_sc32_4x(const __m256& in0, + const __m256& in1, + const __m256& in2, + const __m256& in3, + const __m256& scalar) +{ + __m256i tmpi0 = _mm256_cvtps_epi32(_mm256_mul_ps(in0, scalar)); + tmpi0 = _mm256_shuffle_epi32(tmpi0, shuf); + __m256i tmpi1 = _mm256_cvtps_epi32(_mm256_mul_ps(in1, scalar)); + tmpi1 = _mm256_shuffle_epi32(tmpi1, shuf); + + __m256i shuf_lo_lo = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x20); + __m256i shuf_lo_hi = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x31); + const __m256i lo = _mm256_packs_epi32(shuf_lo_lo, shuf_lo_hi); + + __m256i tmpi2 = _mm256_cvtps_epi32(_mm256_mul_ps(in2, scalar)); + tmpi2 = _mm256_shuffle_epi32(tmpi2, shuf); + __m256i tmpi3 = _mm256_cvtps_epi32(_mm256_mul_ps(in3, scalar)); + tmpi3 = _mm256_shuffle_epi32(tmpi3, shuf); + + __m256i shuf_hi_lo = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x20); + __m256i shuf_hi_hi = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x31); + const __m256i hi = _mm256_packs_epi32(shuf_hi_lo, shuf_hi_hi); + + __m256i shuf_lo = _mm256_permute2x128_si256(lo, hi, 0x20); + __m256i shuf_hi = _mm256_permute2x128_si256(lo, hi, 0x31); + + return _mm256_packs_epi16(shuf_lo, shuf_hi); +} + +DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD_AVX2) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + const int shuf = _MM_SHUFFLE(3, 2, 1, 0); + + size_t i = 0; + + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) 
{ + /* load from input */ + __m256 tmp0 = _mm256_loadu_ps(reinterpret_cast(input + i + 0)); + __m256 tmp1 = _mm256_loadu_ps(reinterpret_cast(input + i + 4)); + __m256 tmp2 = _mm256_loadu_ps(reinterpret_cast(input + i + 8)); + __m256 tmp3 = _mm256_loadu_ps(reinterpret_cast(input + i + 12)); + + /* convert */ + const __m256i tmpi = pack_sc32_4x(tmp0, tmp1, tmp2, tmp3, scalar); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD_AVX2) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + const int shuf = _MM_SHUFFLE(0, 1, 2, 3); + + size_t i = 0; + + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { + /* load from input */ + __m256 tmp0 = _mm256_loadu_ps(reinterpret_cast(input + i + 0)); + __m256 tmp1 = _mm256_loadu_ps(reinterpret_cast(input + i + 4)); + __m256 tmp2 = _mm256_loadu_ps(reinterpret_cast(input + i + 8)); + __m256 tmp3 = _mm256_loadu_ps(reinterpret_cast(input + i + 12)); + + /* convert */ + const __m256i tmpi = pack_sc32_4x(tmp0, tmp1, tmp2, tmp3, scalar); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc64_to_sc16.cpp b/host/lib/convert/avx2_fc64_to_sc16.cpp new file mode 100644 index 0000000000..5794b3c9e2 --- /dev/null +++ b/host/lib/convert/avx2_fc64_to_sc16.cpp @@ -0,0 +1,139 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD_AVX2) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256d tmp0 = _mm256_loadu_pd(reinterpret_cast(input + i + 0)); + __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast(input + i + 2)); + __m256d tmp2 = _mm256_loadu_pd(reinterpret_cast(input + i + 4)); + __m256d tmp3 = _mm256_loadu_pd(reinterpret_cast(input + i + 6)); + + /* convert and scale */ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); + + /* Unpack and interleave the results */ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); + + /* Pack and swap 16-bit pairs */ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); + + /* pack + swap 16-bit pairs */ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); + + /* pack + swap 16-bit pairs */ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert remainder + 
xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD_AVX2) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256d tmp0 = _mm256_loadu_pd(reinterpret_cast(input + i + 0)); + __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast(input + i + 2)); + __m256d tmp2 = _mm256_loadu_pd(reinterpret_cast(input + i + 4)); + __m256d tmp3 = _mm256_loadu_pd(reinterpret_cast(input + i + 6)); + + /* convert and scale */ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); + + /* Unpack and interleave the results */ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); + + /* Pack and swap 16-bit pairs */ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); + + /* pack + swap 16-bit pairs */ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert remainder + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc16_chdr, 1, PRIORITY_SIMD_AVX2) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256d tmp0 = _mm256_loadu_pd(reinterpret_cast(input + i + 0)); + __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast(input + i + 2)); + __m256d tmp2 = _mm256_loadu_pd(reinterpret_cast(input + i + 4)); + __m256d tmp3 = _mm256_loadu_pd(reinterpret_cast(input + i + 6)); + + /* convert and scale */ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); + + /* Unpack and interleave the results */ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); + + /* Pack and swap 16-bit pairs */ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); + + /* pack + swap 16-bit pairs */ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert remainder + xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc64_to_sc8.cpp b/host/lib/convert/avx2_fc64_to_sc8.cpp new file mode 100644 index 0000000000..ba320f3eb5 --- /dev/null +++ b/host/lib/convert/avx2_fc64_to_sc8.cpp @@ -0,0 +1,103 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using 
namespace uhd::convert; + +UHD_INLINE __m256i pack_sc8_item32_4x( + const __m256i& in0, const __m256i& in1, const __m256i& in2, const __m256i& in3) +{ + const __m256i shuffled_in0_lo = _mm256_permute2x128_si256(in0, in1, 0x20); + const __m256i shuffled_in0_hi = _mm256_permute2x128_si256(in0, in1, 0x31); + const __m256i shuffled_in1_lo = _mm256_permute2x128_si256(in2, in3, 0x20); + const __m256i shuffled_in1_hi = _mm256_permute2x128_si256(in2, in3, 0x31); + + const __m256i lo = _mm256_packs_epi32(shuffled_in0_lo, shuffled_in0_hi); + const __m256i hi = _mm256_packs_epi32(shuffled_in1_lo, shuffled_in1_hi); + return _mm256_packs_epi16(lo, hi); +} + +UHD_INLINE __m256i pack_sc32_4x( + const __m256d& lo, const __m256d& hi, const __m256d& scalar) +{ + const __m128i tmpi_lo = _mm256_cvttpd_epi32(_mm256_mul_pd(hi, scalar)); + const __m128i tmpi_hi = _mm256_cvttpd_epi32(_mm256_mul_pd(lo, scalar)); + + return _mm256_set_m128i(tmpi_hi, tmpi_lo); +} + +DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD_AVX2) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { + /* load from input */ + __m256d tmp0 = _mm256_loadu_pd(reinterpret_cast(input + i + 0)); + __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast(input + i + 2)); + __m256d tmp2 = _mm256_loadu_pd(reinterpret_cast(input + i + 4)); + __m256d tmp3 = _mm256_loadu_pd(reinterpret_cast(input + i + 6)); + __m256d tmp4 = _mm256_loadu_pd(reinterpret_cast(input + i + 8)); + __m256d tmp5 = _mm256_loadu_pd(reinterpret_cast(input + i + 10)); + __m256d tmp6 = _mm256_loadu_pd(reinterpret_cast(input + i + 12)); + __m256d tmp7 = _mm256_loadu_pd(reinterpret_cast(input + i + 14)); + + /* interleave */ + const __m256i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp1, tmp0, scalar), + pack_sc32_4x(tmp3, tmp2, scalar), + pack_sc32_4x(tmp5, tmp4, scalar), + pack_sc32_4x(tmp7, tmp6, scalar)); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD_AVX2) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { + /* load from input */ + __m256d tmp0 = _mm256_loadu_pd(reinterpret_cast(input + i + 0)); + __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast(input + i + 2)); + __m256d tmp2 = _mm256_loadu_pd(reinterpret_cast(input + i + 4)); + __m256d tmp3 = _mm256_loadu_pd(reinterpret_cast(input + i + 6)); + __m256d tmp4 = _mm256_loadu_pd(reinterpret_cast(input + i + 8)); + __m256d tmp5 = _mm256_loadu_pd(reinterpret_cast(input + i + 10)); + __m256d tmp6 = _mm256_loadu_pd(reinterpret_cast(input + i + 12)); + __m256d tmp7 = _mm256_loadu_pd(reinterpret_cast(input + i + 14)); + + /* interleave */ + __m256i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp0, tmp1, scalar), + pack_sc32_4x(tmp2, tmp3, scalar), + pack_sc32_4x(tmp4, tmp5, scalar), + pack_sc32_4x(tmp6, tmp7, scalar)); + tmpi = _mm256_or_si256( + _mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); /*byteswap*/ + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); + } + + // convert remainder + xx_to_item32_sc8(input + i, 
output + (i / 2), nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_sc16_to_fc32.cpp b/host/lib/convert/avx2_sc16_to_fc32.cpp new file mode 100644 index 0000000000..338f74253a --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_fc32.cpp @@ -0,0 +1,112 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* swap 16-bit pairs: [imag, real] -> [real, imag] */ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + + /* split into 128-bit halves and sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(tmpi)); + __m256i int32_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(tmpi, 1)); + + /* convert to float and scale */ + __m256 tmplo = _mm256_mul_ps(_mm256_cvtepi32_ps(int32_lo), scalar); + __m256 tmphi = _mm256_mul_ps(_mm256_cvtepi32_ps(int32_hi), scalar); + + /* store to output */ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), tmplo); + _mm256_storeu_ps(reinterpret_cast(output + i + 4), tmphi); + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* byteswap within each 16-bit word */ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); + + /* split into 128-bit halves and sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(tmpi)); + __m256i int32_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(tmpi, 1)); + + /* convert to float and scale */ + __m256 tmplo = _mm256_mul_ps(_mm256_cvtepi32_ps(int32_lo), scalar); + __m256 tmphi = _mm256_mul_ps(_mm256_cvtepi32_ps(int32_hi), scalar); + + /* store to output */ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), tmplo); + _mm256_storeu_ps(reinterpret_cast(output + i + 4), tmphi); + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_chdr, 1, fc32, 1, PRIORITY_SIMD_AVX2) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load 8 complex samples as 2x 128-bit halves */ + __m128i in_lo = _mm_loadu_si128(reinterpret_cast(input + i)); + __m128i in_hi = _mm_loadu_si128(reinterpret_cast(input + i + 4)); + + /* sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(in_lo); + __m256i int32_hi = _mm256_cvtepi16_epi32(in_hi); + + /* convert to float and scale */ + __m256 float_lo = 
_mm256_mul_ps(_mm256_cvtepi32_ps(int32_lo), scalar); + __m256 float_hi = _mm256_mul_ps(_mm256_cvtepi32_ps(int32_hi), scalar); + + /* store to output */ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), float_lo); + _mm256_storeu_ps(reinterpret_cast(output + i + 4), float_hi); + } + + // convert any remaining samples + for (; i < nsamps; i++) { + output[i] = fc32_t(float(input[i].real()) * float(scale_factor), + float(input[i].imag()) * float(scale_factor)); + } +} diff --git a/host/lib/convert/avx2_sc16_to_fc64.cpp b/host/lib/convert/avx2_sc16_to_fc64.cpp new file mode 100644 index 0000000000..4841089dd1 --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_fc64.cpp @@ -0,0 +1,121 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* swap 16-bit pairs: [imag, real] -> [real, imag] */ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + + /* sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(tmpi)); + __m256i int32_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(tmpi, 1)); + + /* convert to double and scale */ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_lo)), scalar); + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_lo, 1)), scalar); + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_hi)), scalar); + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_hi, 1)), scalar); + + /* store to output */ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); + } + + // convert remainder + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* byteswap within each 16-bit word */ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); + + /* sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(tmpi)); + __m256i int32_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(tmpi, 1)); + + /* convert to double and scale */ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_lo)), scalar); + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_lo, 1)), scalar); + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_hi)), scalar); + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_hi, 1)), 
scalar); + + /* store to output */ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); + } + + // convert remainder + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_chdr, 1, fc64, 1, PRIORITY_SIMD_AVX2) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load 8 complex samples as 2x 128-bit halves */ + __m128i in_lo = _mm_loadu_si128(reinterpret_cast(input + i)); + __m128i in_hi = _mm_loadu_si128(reinterpret_cast(input + i + 4)); + + /* sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(in_lo); + __m256i int32_hi = _mm256_cvtepi16_epi32(in_hi); + + /* convert to double and scale */ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_lo)), scalar); + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_lo, 1)), scalar); + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_hi)), scalar); + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_hi, 1)), scalar); + + /* store to output */ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); + } + + // convert remainder + chdr_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_sc16_to_sc16.cpp b/host/lib/convert/avx2_sc16_to_sc16.cpp new file mode 100644 index 0000000000..76192e3eb4 --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_sc16.cpp @@ -0,0 +1,113 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(sc16, 1, sc16_item32_le, 1, PRIORITY_SIMD_AVX2) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + __m256i m0; + + /* load from input */ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); + + /* swap 16-bit pairs */ + m0 = _mm256_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); + m0 = _mm256_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); + + /* store to output */ + _mm256_storeu_si256((__m256i*)(output + i), m0); + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16, 1, sc16_item32_be, 1, PRIORITY_SIMD_AVX2) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + __m256i m0, m1, m2; + + /* load from input */ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); + + /* byteswap 16 bit words */ + m1 = _mm256_srli_epi16(m0, 8); + m2 = _mm256_slli_epi16(m0, 8); + m0 = _mm256_or_si256(m1, m2); + + /* store to output */ + _mm256_storeu_si256((__m256i*)(output + i), m0); + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16_item32_le, 1, sc16, 1, 
PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + __m256i m0; + + /* load from input */ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); + + /* swap 16-bit pairs */ + m0 = _mm256_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); + m0 = _mm256_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); + + /* store to output */ + _mm256_storeu_si256((__m256i*)(output + i), m0); + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16_item32_be, 1, sc16, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + __m256i m0, m1, m2; + + /* load from input */ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); + + /* byteswap 16 bit words */ + m1 = _mm256_srli_epi16(m0, 8); + m2 = _mm256_slli_epi16(m0, 8); + m0 = _mm256_or_si256(m1, m2); + + /* store to output */ + _mm256_storeu_si256((__m256i*)(output + i), m0); + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, 1.0); +} diff --git a/host/lib/convert/avx2_sc8_to_fc32.cpp b/host/lib/convert/avx2_sc8_to_fc32.cpp new file mode 100644 index 0000000000..50b9befb7a --- /dev/null +++ b/host/lib/convert/avx2_sc8_to_fc32.cpp @@ -0,0 +1,105 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +static const __m256i zeroi = _mm256_setzero_si256(); + +template +UHD_INLINE void unpack_sc32_4x(const __m256i& in, + __m256& out0, + __m256& out1, + __m256& out2, + __m256& out3, + const __m256& scalar) +{ + const __m256i tmplo = _mm256_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */ + __m256i tmp0 = _mm256_shuffle_epi32( + _mm256_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */ + __m256i tmp1 = _mm256_shuffle_epi32(_mm256_unpackhi_epi16(zeroi, tmplo), shuf); + out0 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp0), scalar); + out1 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp1), scalar); + + const __m256i tmphi = _mm256_unpackhi_epi8(zeroi, in); + __m256i tmp2 = _mm256_shuffle_epi32(_mm256_unpacklo_epi16(zeroi, tmphi), shuf); + __m256i tmp3 = _mm256_shuffle_epi32(_mm256_unpackhi_epi16(zeroi, tmphi), shuf); + out2 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp2), scalar); + out3 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp3), scalar); +} + +DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(size_t(inputs[0]) & ~0x3); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 24)); + const int shuf = _MM_SHUFFLE(3, 2, 1, 0); + + size_t i = 0, j = 0; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0) { + item32_sc8_to_xx(input++, output++, 1, scale_factor); + num_samps--; + } + + for (; j + 15 < num_samps; j += 16, i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* unpack + swap 8-bit pairs */ + __m256 tmp0, tmp1, tmp2, tmp3; + unpack_sc32_4x(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); + + /* store to output */ + _mm256_storeu_ps(reinterpret_cast(output + j + 0), tmp0); + _mm256_storeu_ps(reinterpret_cast(output + j + 4), tmp1); + 
_mm256_storeu_ps(reinterpret_cast(output + j + 8), tmp2); + _mm256_storeu_ps(reinterpret_cast(output + j + 12), tmp3); + } + + // convert remainder + item32_sc8_to_xx(input + i, output + j, num_samps - j, scale_factor); +} + +DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(size_t(inputs[0]) & ~0x3); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 24)); + const int shuf = _MM_SHUFFLE(0, 1, 2, 3); + + size_t i = 0, j = 0; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0) { + item32_sc8_to_xx(input++, output++, 1, scale_factor); + num_samps--; + } + + for (; j + 15 < num_samps; j += 16, i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* unpack + swap 8-bit pairs */ + __m256 tmp0, tmp1, tmp2, tmp3; + unpack_sc32_4x(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); + + /* store to output */ + _mm256_storeu_ps(reinterpret_cast(output + j + 0), tmp0); + _mm256_storeu_ps(reinterpret_cast(output + j + 4), tmp1); + _mm256_storeu_ps(reinterpret_cast(output + j + 8), tmp2); + _mm256_storeu_ps(reinterpret_cast(output + j + 12), tmp3); + } + + // convert remainder + item32_sc8_to_xx(input + i, output + j, num_samps - j, scale_factor); +} diff --git a/host/lib/convert/convert_common.hpp b/host/lib/convert/convert_common.hpp index 97c0351329..a81ece72fb 100644 --- a/host/lib/convert/convert_common.hpp +++ b/host/lib/convert/convert_common.hpp @@ -64,6 +64,17 @@ /*********************************************************************** * Setup priorities + * + * Higher priority = preferred implementation + * When get_converter() is called with priority=-1, it returns the + * highest priority converter available. + * + * Priority hierarchy: + * PRIORITY_EMPTY = -1 (empty/null converter) + * PRIORITY_GENERAL = 0 (generic C++ implementation) + * PRIORITY_TABLE = 1 (table lookup) + * PRIORITY_SIMD = 3 (SSE2/NEON - baseline SIMD) + * PRIORITY_SIMD_AVX2 = 4 (AVX2 - 256-bit SIMD) **********************************************************************/ static const int PRIORITY_GENERAL = 0; static const int PRIORITY_EMPTY = -1; @@ -72,10 +83,12 @@ static const int PRIORITY_EMPTY = -1; static const int PRIORITY_SIMD = 2; static const int PRIORITY_TABLE = 1; // tables require large cache, so they are slower on arm +static const int PRIORITY_SIMD_AVX2 = 2; // Not applicable on ARM #else // We used to have ORC, too, so SIMD is 3 -static const int PRIORITY_SIMD = 3; -static const int PRIORITY_TABLE = 1; +static const int PRIORITY_SIMD = 3; +static const int PRIORITY_TABLE = 1; +static const int PRIORITY_SIMD_AVX2 = 4; #endif /*********************************************************************** diff --git a/host/tests/convert_test.cpp b/host/tests/convert_test.cpp index cf6c6a00ed..40fb448601 100644 --- a/host/tests/convert_test.cpp +++ b/host/tests/convert_test.cpp @@ -43,7 +43,7 @@ struct benchmark_result // List of priority types. This must be manually kept in sync with whatever is // defined in convert_common.hpp -const std::array CONV_PRIO_TYPES{-1, 0, 1, 2, 3}; +const std::array CONV_PRIO_TYPES{-1, 0, 1, 2, 3, 4}; // Use this to create a converter with fixed prio in a test case. If prio does // not exist, we simply exit the test case. That's normal. 
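The priority constants above feed converter selection at runtime: uhd::convert::get_converter() called with a priority of -1 returns the highest-priority converter registered for a given id, so the AVX2 kernels added in this change are picked up automatically on builds where they were compiled in, with SSE2, table lookup, and the generic path as fallbacks. Below is a minimal sketch of that call pattern. It reuses only calls that appear elsewhere in this diff (the id_type fields, get_converter(), set_scalar(), conv()); the uhd/convert.hpp header path, function name, and buffer handling are illustrative assumptions, not part of this change.

    // Hedged sketch: convert one buffer of fc32 samples to the sc16 little-endian
    // wire format with whatever converter has the highest registered priority
    // (AVX2 when available, otherwise SSE2, table lookup, or the generic C++ path).
    #include <uhd/convert.hpp>
    #include <complex>
    #include <cstdint>
    #include <vector>

    void fc32_to_wire(const std::vector<std::complex<float>>& in,
        std::vector<uint32_t>& out,
        const double scale_factor) // e.g. 32768.0, as in get_converter_configs()
    {
        uhd::convert::id_type id;
        id.input_format  = "fc32";
        id.num_inputs    = 1;
        id.output_format = "sc16_item32_le";
        id.num_outputs   = 1;

        // priority -1 means "best available"; an explicit priority (e.g. 4 for AVX2)
        // throws uhd::key_error if that converter was not built on this platform.
        uhd::convert::converter::sptr conv = uhd::convert::get_converter(id, -1)();
        conv->set_scalar(scale_factor);

        out.resize(in.size()); // one item32 word per complex sample
        const std::vector<const void*> in_refs(1, static_cast<const void*>(in.data()));
        const std::vector<void*> out_refs(1, static_cast<void*>(out.data()));
        conv->conv(in_refs, out_refs, in.size());
    }

This is the same probing mechanism the benchmark changes below rely on: run_batch_benchmark() requests each priority explicitly and treats a uhd::key_error from get_converter() as "no converter available at this tier".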
diff --git a/host/utils/converter_benchmark.cpp b/host/utils/converter_benchmark.cpp index e02329e017..9580d1e9ab 100644 --- a/host/utils/converter_benchmark.cpp +++ b/host/utils/converter_benchmark.cpp @@ -14,11 +14,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include namespace po = boost::program_options; using namespace uhd::convert; @@ -261,6 +264,240 @@ std::string item_to_string( } } +// For batch of benchmarks +std::string get_priority_name(int prio) +{ + switch (prio) { + case 0: + return "Generic"; + case 1: + return "Unrolled"; + case 2: + return "NEON"; + case 3: + return "SSE2/SSSE3"; + case 4: + return "AVX2"; + default: + return "Unknown(" + std::to_string(prio) + ")"; + } +} + +struct ConverterConfig +{ + std::string input_format; + std::string output_format; + double scale_factor; + std::string description; +}; + +std::vector get_converter_configs() +{ + return { + // sc16 <-> fc32 + {"sc16_item32_le", "fc32", 1.0 / 32768.0, "Wire LE to float"}, + {"fc32", "sc16_item32_le", 32768.0, "Float to wire LE"}, + {"sc16_item32_be", "fc32", 1.0 / 32768.0, "Wire BE to float"}, + {"fc32", "sc16_item32_be", 32768.0, "Float to wire BE"}, + {"sc16_chdr", "fc32", 1.0 / 32768.0, "CHDR to float"}, + {"fc32", "sc16_chdr", 32768.0, "Float to CHDR"}, + + // sc16 <-> fc64 + {"sc16_item32_le", "fc64", 1.0 / 32768.0, "Wire LE to double"}, + {"fc64", "sc16_item32_le", 32768.0, "Double to wire LE"}, + {"sc16_chdr", "fc64", 1.0 / 32768.0, "CHDR to double"}, + {"fc64", "sc16_chdr", 32768.0, "Double to CHDR"}, + + // sc8 <-> fc32 + {"sc8_item32_le", "fc32", 1.0 / 128.0, "8-bit wire to float"}, + {"fc32", "sc8_item32_le", 128.0, "Float to 8-bit wire"}, + + // sc16 passthrough + {"sc16_item32_le", "sc16", 1.0, "Wire LE to native sc16"}, + {"sc16", "sc16_item32_le", 1.0, "Native sc16 to wire LE"}, + {"sc16_item32_be", "sc16", 1.0, "Wire BE to native sc16"}, + {"sc16", "sc16_item32_be", 1.0, "Native sc16 to wire BE"}, + + // sc12 + {"sc12_item32_le", "sc16", 1.0, "12-bit to sc16"}, + {"sc16", "sc12_item32_le", 1.0, "sc16 to 12-bit"}, + {"sc12_item32_le", "fc32", 1.0 / 2048.0, "12-bit to float"}, + {"fc32", "sc12_item32_le", 2048.0, "Float to 12-bit"}, + }; +} + +struct BenchmarkResult +{ + id_type id; + int priority; + std::string priority_name; + size_t buffer_size; + double ns_per_sample; + double samples_per_sec; + double throughput_gbps; + size_t bytes_per_sample; +}; + +BenchmarkResult run_batch_benchmark(id_type id, + int prio, + size_t buffer_size, + size_t iterations, + double scale_factor = 1.0) +{ + BenchmarkResult result; + result.id = id; + result.priority = prio; + result.priority_name = get_priority_name(prio); + result.buffer_size = buffer_size; + + converter::sptr conv; + try { + conv = get_converter(id, prio)(); + } catch (uhd::key_error&) { + result.ns_per_sample = -1; + result.samples_per_sec = 0; + result.throughput_gbps = 0; + return result; + } + + conv->set_scalar(scale_factor); + + const size_t alignment = 64; + const size_t alloc_size = buffer_size * 16 + alignment; + std::vector input_storage(alloc_size); + std::vector output_storage(alloc_size); + + void* input_ptr = reinterpret_cast( + (reinterpret_cast(input_storage.data()) + alignment - 1) + & ~(alignment - 1)); + void* output_ptr = reinterpret_cast( + (reinterpret_cast(output_storage.data()) + alignment - 1) + & ~(alignment - 1)); + + // Initialize input with deterministic data + uint32_t* input_u32 = static_cast(input_ptr); + for (size_t i = 0; i < buffer_size * 4; i++) 
{ + input_u32[i] = static_cast(i * 12345 + 67890); + } + + std::vector input_buf_refs(1, input_ptr); + std::vector output_buf_refs(1, output_ptr); + + // Warm-up runs + conv->conv(input_buf_refs, output_buf_refs, buffer_size); + conv->conv(input_buf_refs, output_buf_refs, buffer_size); + + // Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (size_t iter = 0; iter < iterations; iter++) { + conv->conv(input_buf_refs, output_buf_refs, buffer_size); + } + auto end = std::chrono::high_resolution_clock::now(); + + double elapsed_ns = + std::chrono::duration_cast(end - start).count(); + double total_samples = static_cast(buffer_size) * iterations; + + result.ns_per_sample = elapsed_ns / total_samples; + result.samples_per_sec = total_samples / (elapsed_ns * 1e-9); + const std::string in_type = format_to_type(id.input_format); + const std::string out_type = format_to_type(id.output_format); + result.bytes_per_sample = + get_bytes_per_item(in_type) + get_bytes_per_item(out_type); + result.throughput_gbps = + (result.samples_per_sec * result.bytes_per_sample * 8) / 1e9; + + return result; +} + +void print_results_table(const std::vector& results, + const std::string& title, + bool show_throughput = false) +{ + std::cout << "\n" << std::string(80, '=') << "\n"; + std::cout << title << "\n"; + std::cout << std::string(80, '=') << "\n"; + + // Group by conversion type + std::map> grouped; + for (const auto& r : results) { + std::string key = r.id.input_format + " -> " + r.id.output_format; + grouped[key].push_back(r); + } + + for (const auto& kv : grouped) { + std::cout << "\n" << kv.first << ":\n"; + std::cout << std::string(70, '-') << "\n"; + + if (show_throughput) { + std::cout << std::setw(12) << "Buffer Size" << std::setw(15) << "Priority" + << std::setw(15) << "ns/sample" << std::setw(15) << "MSamples/s" + << std::setw(12) << "Gbps" << "\n"; + } else { + std::cout << std::setw(12) << "Buffer Size" << std::setw(15) << "Priority" + << std::setw(15) << "ns/sample" << std::setw(15) << "MSamples/s" + << "\n"; + } + std::cout << std::string(70, '-') << "\n"; + + for (const auto& r : kv.second) { + if (r.ns_per_sample < 0) { + std::cout << std::setw(12) << r.buffer_size << std::setw(15) + << r.priority_name << std::setw(15) << "N/A" << std::setw(15) + << "N/A" << "\n"; + } else { + std::cout << std::setw(12) << r.buffer_size << std::setw(15) + << r.priority_name << std::setw(15) << std::fixed + << std::setprecision(3) << r.ns_per_sample << std::setw(15) + << std::setprecision(2) << r.samples_per_sec / 1e6; + if (show_throughput) { + std::cout << std::setw(12) << std::setprecision(2) + << r.throughput_gbps; + } + std::cout << "\n"; + } + } + } +} + +void print_comparison_table(const std::vector& results, + const std::string& conversion, + size_t buffer_size) +{ + std::cout << "\n" << conversion << " @ " << buffer_size << " samples:\n"; + std::cout << std::string(60, '-') << "\n"; + std::cout << std::setw(15) << "Priority" << std::setw(15) << "ns/sample" + << std::setw(15) << "MSamples/s" << std::setw(15) << "Speedup" << "\n"; + std::cout << std::string(60, '-') << "\n"; + + // Find baseline (Generic, prio 0) + double baseline_ns = 0; + for (const auto& r : results) { + if (r.priority == 0 && r.ns_per_sample > 0) { + baseline_ns = r.ns_per_sample; + break; + } + } + + for (const auto& r : results) { + if (r.ns_per_sample < 0) { + std::cout << std::setw(15) << r.priority_name << std::setw(15) << "N/A" + << std::setw(15) << "N/A" << std::setw(15) << "N/A" << "\n"; + } else if 
(baseline_ns <= 0) { + std::cout << std::setw(15) << r.priority_name << std::setw(15) << std::fixed + << std::setprecision(3) << r.ns_per_sample << std::setw(15) + << std::setprecision(2) << r.samples_per_sec / 1e6 << std::setw(15) + << "N/A" << "\n"; + } else { + double speedup = baseline_ns / r.ns_per_sample; + std::cout << std::setw(15) << r.priority_name << std::setw(15) << std::fixed + << std::setprecision(3) << r.ns_per_sample << std::setw(15) + << std::setprecision(2) << r.samples_per_sec / 1e6 << std::setw(14) + << std::setprecision(2) << speedup << "x" << "\n"; + } + } +} + int UHD_SAFE_MAIN(int argc, char* argv[]) { std::string in_format, out_format; @@ -281,12 +518,18 @@ int UHD_SAFE_MAIN(int argc, char* argv[]) ("samples", po::value(&n_samples)->default_value(1000000), "Number of samples per iteration") ("iterations", po::value(&iterations)->default_value(10000), "Number of iterations per benchmark") ("priorities", po::value(&priorities)->default_value("default"), "Converter priorities. Can be 'default', 'all', or a comma-separated list of priorities.") - ("max-prio", po::value(&max_prio)->default_value(4), "Largest available priority (advanced feature)") + ("max-prio", po::value(&max_prio)->default_value(5), "Largest available priority (advanced feature)") ("n-inputs", po::value(&n_inputs)->default_value(1), "Number of input vectors") ("n-outputs", po::value(&n_outputs)->default_value(1), "Number of output vectors") ("debug-converter", "Skip benchmark and print conversion results. Implies iterations==1 and will only run on a single converter.") ("seed-mode", po::value(&seed_mode)->default_value("random"), "How to initialize the data: random, incremental") ("hex", "When using debug mode, dump memory in hex") + ("batch", "Run batch benchmark across all predefined converter configurations") + ("buffer-sizes", po::value()->default_value("64,256,1024,4096,16384,65536,262144"), "Buffer sizes for batch mode (comma-separated)") + ("compare", "Show comparison tables with speedup (batch mode)") + ("throughput", "Show throughput in Gbps (batch mode)") + ("csv", "Output batch results in CSV format") + ("quick", "Quick batch mode: fewer buffer sizes and iterations") ; // clang-format on po::variables_map vm; @@ -309,6 +552,172 @@ int UHD_SAFE_MAIN(int argc, char* argv[]) return EXIT_FAILURE; } + // Batch mode: run predefined converter configs across multiple buffer sizes + if (vm.count("batch")) { + std::vector buffer_sizes; + std::string sizes_str = vm["buffer-sizes"].as(); + if (vm.count("quick")) { + sizes_str = "256,4096,65536"; + } + { + std::stringstream ss(sizes_str); + std::string item; + while (std::getline(ss, item, ',')) { + buffer_sizes.push_back(std::stoull(item)); + } + } + + std::vector batch_priorities; + { + // Use max_prio to determine which priorities to test + for (int i = 0; i < static_cast(max_prio); i++) { + batch_priorities.push_back(i); + } + } + + size_t batch_iterations = iterations; + if (vm.count("quick")) { + batch_iterations = 20; + } + + std::string filter; + bool show_comparison = vm.count("compare") > 0; + bool show_throughput = vm.count("throughput") > 0; + bool csv_output = vm.count("csv") > 0; + auto converter_configs = get_converter_configs(); + + std::cout << "========================================\n"; + std::cout << "UHD Converter Batch Benchmark\n"; + std::cout << "========================================\n"; + std::cout << "Buffer sizes: "; + for (auto s : buffer_sizes) + std::cout << s << " "; + std::cout << "\n"; + std::cout << "Iterations: " 
<< batch_iterations << "\n"; + std::cout << "Priorities: "; + for (auto p : batch_priorities) + std::cout << p << " "; + std::cout << "\n\n"; + + std::vector all_results; + + if (csv_output) { + std::cout << "input_format,output_format,buffer_size,priority," + "priority_name,ns_per_sample,msamples_per_sec,gbps\n"; + } + + // Run benchmarks + for (const auto& config : converter_configs) { + id_type bid; + bid.input_format = config.input_format; + bid.num_inputs = 1; + bid.output_format = config.output_format; + bid.num_outputs = 1; + + std::string conv_name = + config.input_format + " -> " + config.output_format; + + if (!csv_output) { + std::cout << "Benchmarking: " << conv_name << " (" + << config.description << ")...\n"; + } + + std::vector conv_results; + + for (size_t buf_size : buffer_sizes) { + for (int bprio : batch_priorities) { + auto result = run_batch_benchmark( + bid, bprio, buf_size, batch_iterations, config.scale_factor); + + if (csv_output && result.ns_per_sample > 0) { + std::cout << config.input_format << "," + << config.output_format << "," << buf_size + << "," << bprio << "," << result.priority_name + << "," << std::fixed << std::setprecision(3) + << result.ns_per_sample << "," + << std::setprecision(2) + << result.samples_per_sec / 1e6 << "," + << result.throughput_gbps << "\n"; + } + + all_results.push_back(result); + conv_results.push_back(result); + } + } + + if (show_comparison && !csv_output) { + for (size_t buf_size : buffer_sizes) { + std::vector size_results; + for (const auto& r : conv_results) { + if (r.buffer_size == buf_size) { + size_results.push_back(r); + } + } + print_comparison_table(size_results, conv_name, buf_size); + } + } + } + + // summary tables + if (!csv_output) { + print_results_table(all_results, "All Benchmark Results", show_throughput); + + // Print fastest converter for each type at largest buffer size + std::cout << "\n" << std::string(80, '=') << "\n"; + std::cout << "Summary: Fastest Converter for Each Type\n"; + std::cout << std::string(80, '=') << "\n"; + std::cout << std::setw(35) << "Conversion" << std::setw(15) + << "Best Priority" << std::setw(15) << "ns/sample" + << std::setw(15) << "Speedup vs Gen" << "\n"; + std::cout << std::string(80, '-') << "\n"; + + size_t largest_buf = + *std::max_element(buffer_sizes.begin(), buffer_sizes.end()); + + // Group by conversion + std::map> by_conv; + for (const auto& r : all_results) { + if (r.buffer_size == largest_buf && r.ns_per_sample > 0) { + std::string key = + r.id.input_format + " -> " + r.id.output_format; + by_conv[key].push_back(r); + } + } + + for (auto& kv : by_conv) { + auto& results = kv.second; + auto fastest = std::min_element(results.begin(), + results.end(), + [](const BenchmarkResult& a, const BenchmarkResult& b) { + return a.ns_per_sample < b.ns_per_sample; + }); + + double generic_ns = 0; + for (const auto& r : results) { + if (r.priority == 0 && r.ns_per_sample > 0) { + generic_ns = r.ns_per_sample; + break; + } + } + + std::cout << std::setw(35) << kv.first << std::setw(15) + << fastest->priority_name << std::setw(15) << std::fixed + << std::setprecision(3) << fastest->ns_per_sample; + + if (generic_ns > 0) { + double speedup = generic_ns / fastest->ns_per_sample; + std::cout << std::setw(14) << std::setprecision(2) << speedup + << "x"; + } else { + std::cout << std::setw(15) << "N/A"; + } + std::cout << "\n"; + } + } + + return EXIT_SUCCESS; + } + // Parse more arguments if (seed_mode == "incremental") { buf_seed_mode = INC;
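With the batch additions above, the tool can sweep the predefined conversions from get_converter_configs() across several buffer sizes and every priority tier in one run. Assuming the utility keeps its existing binary name (converter_benchmark, built from this file), an invocation such as "converter_benchmark --batch --quick --compare" runs the reduced sweep (buffer sizes 256, 4096, and 65536 at 20 iterations) and prints per-conversion speedup tables against the generic priority-0 baseline, while "--batch --csv" emits the same measurements as machine-readable rows under the header input_format,output_format,buffer_size,priority,priority_name,ns_per_sample,msamples_per_sec,gbps. Note that the reported Gbps figure counts bytes moved on both sides of the conversion: sc16_item32_le to fc32, for example, touches 4 + 8 = 12 bytes per sample, so 100 Msamples/s is reported as 100e6 * 12 * 8 / 1e9 = 9.6 Gbps.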