Skip to content
85 changes: 65 additions & 20 deletions host/lib/convert/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#
# Copyright 2011-2013 Ettus Research LLC
# Copyright 2018 Ettus Research, a National Instruments Company
# Copyright 2024 Ettus Research, a National Instruments Company
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand All @@ -9,29 +10,55 @@
# This file included, use CMake directory variables
########################################################################
include(CheckIncludeFileCXX)
include(CheckCXXCompilerFlag)
message(STATUS "")

########################################################################
# Check for SSE2 SIMD headers
# Check for x86 SIMD compiler support
########################################################################
if(CMAKE_COMPILER_IS_GNUCXX)
set(EMMINTRIN_FLAGS -msse2)
set(TMMINTRIN_FLAGS -mssse3)
elseif(MSVC)
set(EMMINTRIN_FLAGS /arch:SSE2)

# Detect whether the target CPU is x86/x86_64 (the SSE/AVX checks below
# only make sense on these architectures).
set(IS_X86_PLATFORM FALSE)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i[3-6]86")
    set(IS_X86_PLATFORM TRUE)
endif()

set(CMAKE_REQUIRED_FLAGS ${EMMINTRIN_FLAGS})
CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H)
unset(CMAKE_REQUIRED_FLAGS)
if(IS_X86_PLATFORM)
    # Probe the compiler for x86 SIMD support. check_cxx_compiler_flag()
    # records its (cached) result in the COMPILER_SUPPORTS_* variables,
    # which gate the SIMD converter source lists below.
    #
    # NOTE: CMAKE_COMPILER_IS_GNUCXX is a legacy variable; matching
    # CMAKE_CXX_COMPILER_ID against "GNU|Clang" is equivalent (it also
    # covers AppleClang, which the old "Clang" MATCHES already accepted).
    if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
        # GCC/Clang require an explicit -m flag per ISA level.
        check_cxx_compiler_flag("-msse2" COMPILER_SUPPORTS_SSE2)
        check_cxx_compiler_flag("-mssse3" COMPILER_SUPPORTS_SSSE3)
        check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
        set(SSE2_FLAGS -msse2)
        set(SSSE3_FLAGS -mssse3)
        set(AVX2_FLAGS -mavx2)
    elseif(MSVC)
        # MSVC emits SSE2/SSSE3 code by default on x64; there is no
        # dedicated flag for those levels, only for AVX2.
        set(COMPILER_SUPPORTS_SSE2 TRUE)
        set(COMPILER_SUPPORTS_SSSE3 TRUE)
        check_cxx_compiler_flag("/arch:AVX2" COMPILER_SUPPORTS_AVX2)
        set(SSE2_FLAGS "") # Default on x64
        set(SSSE3_FLAGS "") # Default on x64
        set(AVX2_FLAGS /arch:AVX2)
    endif()

    if(COMPILER_SUPPORTS_SSE2)
        message(STATUS "Compiler supports SSE2 - will build SSE2 converters")
    endif()
    if(COMPILER_SUPPORTS_SSSE3)
        message(STATUS "Compiler supports SSSE3 - will build SSSE3 converters")
    endif()
    if(COMPILER_SUPPORTS_AVX2)
        message(STATUS "Compiler supports AVX2 - will build AVX2 converters")
    endif()
endif()

if(HAVE_EMMINTRIN_H)
########################################################################
# x86 SIMD converter sources
########################################################################

if(COMPILER_SUPPORTS_SSE2)
set(convert_with_sse2_sources
${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_sc16.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_fc64.cpp
Expand All @@ -45,22 +72,40 @@ if(HAVE_EMMINTRIN_H)
)
set_source_files_properties(
${convert_with_sse2_sources}
PROPERTIES COMPILE_FLAGS "${EMMINTRIN_FLAGS}"
PROPERTIES COMPILE_FLAGS "${SSE2_FLAGS}"
)
LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources})
endif(HAVE_EMMINTRIN_H)
endif()

if(HAVE_TMMINTRIN_H)
if(COMPILER_SUPPORTS_SSSE3)
    # SSSE3-accelerated sc12 pack/unpack converters
    set(ssse3_converter_sources "")
    foreach(ssse3_src ssse3_pack_sc12.cpp ssse3_unpack_sc12.cpp)
        list(APPEND ssse3_converter_sources
            "${CMAKE_CURRENT_SOURCE_DIR}/${ssse3_src}")
    endforeach()
    set_source_files_properties(
        ${ssse3_converter_sources}
        PROPERTIES COMPILE_FLAGS "${SSSE3_FLAGS}"
    )
    LIBUHD_APPEND_SOURCES(${ssse3_converter_sources})
endif()

if(ENABLE_AVX2 AND COMPILER_SUPPORTS_AVX2)
    # AVX2-accelerated sc16/sc8 <-> fc32/fc64 converters
    set(avx2_converter_sources "")
    foreach(avx2_src
        avx2_sc16_to_sc16.cpp
        avx2_sc16_to_fc64.cpp
        avx2_sc16_to_fc32.cpp
        avx2_sc8_to_fc32.cpp
        avx2_fc64_to_sc16.cpp
        avx2_fc32_to_sc16.cpp
        avx2_fc64_to_sc8.cpp
        avx2_fc32_to_sc8.cpp
    )
        list(APPEND avx2_converter_sources
            "${CMAKE_CURRENT_SOURCE_DIR}/${avx2_src}")
    endforeach()
    set_source_files_properties(
        ${avx2_converter_sources}
        PROPERTIES COMPILE_FLAGS "${AVX2_FLAGS}"
    )
    LIBUHD_APPEND_SOURCES(${avx2_converter_sources})
endif()

########################################################################
# Check for NEON SIMD headers
Expand Down
120 changes: 120 additions & 0 deletions host/lib/convert/avx2_fc32_to_sc16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
//
// Copyright 2026 Ettus Research, a National Instruments Brand
//
// SPDX-License-Identifier: GPL-3.0-or-later
//

#include "convert_common.hpp"
#include <uhd/utils/byteswap.hpp>
#include <immintrin.h>

using namespace uhd::convert;

// fc32 -> sc16, item32 little-endian wire format, AVX2-accelerated.
// Each loop iteration converts 8 complex samples (16 floats); any tail of
// fewer than 8 samples is handled by the scalar fallback at the bottom.
DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD_AVX2)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);

    // broadcast the scale factor into all 8 float lanes
    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    size_t i = 0;

    for (; i + 7 < nsamps; i += 8) {
        /* load from input; each fc32_t is two floats, so one 8-float load
         * covers 4 complex samples */
        __m256 tmplo = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 0));
        __m256 tmphi = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 4));

        /* scale, then convert float -> int32 */
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));

        /* _mm256_packs_epi32 interleaves its operands per 128-bit lane, so
         * pre-arrange the lanes to keep the samples in sequential order */
        __m256i shuffled_lo = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */
        __m256i shuffled_hi = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */

        /* now pack the shuffled data sequentially (saturating int32->int16) */
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);

        /* swap each pair of 16-bit words within the 32-bit items to match
         * the wire order produced by the scalar uhd::htowx path below */
        tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));
        tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));

        /* store to output */
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);
    }

    // convert any remaining samples with the scalar routine
    xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
}

// fc32 -> sc16, item32 big-endian wire format, AVX2-accelerated.
// Same structure as the little-endian converter above, but the final
// reordering is a byte swap within each 16-bit value (cf. uhd::htonx tail).
DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD_AVX2)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);

    // broadcast the scale factor into all 8 float lanes
    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    size_t i = 0;

    for (; i + 7 < nsamps; i += 8) {
        /* load from input; 4 complex samples (8 floats) per load */
        __m256 tmplo = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 0));
        __m256 tmphi = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 4));

        /* scale, then convert float -> int32 */
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));

        /* _mm256_packs_epi32 interleaves its operands per 128-bit lane, so
         * pre-arrange the lanes to keep the samples in sequential order */
        __m256i shuffled_lo = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */
        __m256i shuffled_hi = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */

        /* Now pack the shuffled data sequentially (saturating int32->int16) */
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);

        /* byte-swap each 16-bit value (shift left + shift right + OR) */
        tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8));

        /* store to output */
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);
    }

    // convert any remaining samples with the scalar routine
    xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
}

// fc32 -> sc16 in CHDR (host) order, AVX2-accelerated.
// Unlike the item32 converters above, the output is written as sc16_t
// directly with no final byte/word reordering step.
DECLARE_CONVERTER(fc32, 1, sc16_chdr, 1, PRIORITY_SIMD_AVX2)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    sc16_t* output = reinterpret_cast<sc16_t*>(outputs[0]);

    // broadcast the scale factor into all 8 float lanes
    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    size_t i = 0;

    for (; i + 7 < nsamps; i += 8) {
        /* load from input; 4 complex samples (8 floats) per load */
        __m256 tmplo = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 0));
        __m256 tmphi = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 4));

        /* scale, then convert float -> int32 */
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));

        /* mm256_packs_epi32 is not sequential (it interleaves its operands
         * per 128-bit lane), so the lanes must be pre-arranged first */
        __m256i shuffled_lo = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */
        __m256i shuffled_hi = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */

        /* Now pack the shuffled data sequentially (saturating int32->int16) */
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);

        /* store to output */
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);
    }

    // convert any remaining samples with the scalar routine
    xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor);
}
98 changes: 98 additions & 0 deletions host/lib/convert/avx2_fc32_to_sc8.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
//
// Copyright 2026 Ettus Research, a National Instruments Brand
//
// SPDX-License-Identifier: GPL-3.0-or-later
//

#include "convert_common.hpp"
#include <uhd/utils/byteswap.hpp>
#include <immintrin.h>

using namespace uhd::convert;

/*
 * Scale and pack four vectors of 8 floats (= 16 fc32 samples) into one
 * __m256i of 32 saturated int8 values, preserving sample order.
 *
 * shuf: an _MM_SHUFFLE constant applied to the 32-bit elements of each
 *       converted vector before packing; callers use it to select the
 *       element order required by the target endianness.
 *
 * Both _mm256_packs_epi32 and _mm256_packs_epi16 interleave their operands
 * per 128-bit lane, so every pack is preceded by _mm256_permute2x128_si256
 * calls (0x20 = low lanes of both inputs, 0x31 = high lanes of both inputs)
 * to keep the data sequential.
 */
template <const int shuf>
UHD_INLINE __m256i pack_sc32_4x(const __m256& in0,
    const __m256& in1,
    const __m256& in2,
    const __m256& in3,
    const __m256& scalar)
{
    /* scale, convert to int32, and reorder 32-bit elements */
    __m256i tmpi0 = _mm256_cvtps_epi32(_mm256_mul_ps(in0, scalar));
    tmpi0         = _mm256_shuffle_epi32(tmpi0, shuf);
    __m256i tmpi1 = _mm256_cvtps_epi32(_mm256_mul_ps(in1, scalar));
    tmpi1         = _mm256_shuffle_epi32(tmpi1, shuf);

    /* pack the first two vectors to int16 in sequential order */
    __m256i shuf_lo_lo = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x20);
    __m256i shuf_lo_hi = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x31);
    const __m256i lo   = _mm256_packs_epi32(shuf_lo_lo, shuf_lo_hi);

    __m256i tmpi2 = _mm256_cvtps_epi32(_mm256_mul_ps(in2, scalar));
    tmpi2         = _mm256_shuffle_epi32(tmpi2, shuf);
    __m256i tmpi3 = _mm256_cvtps_epi32(_mm256_mul_ps(in3, scalar));
    tmpi3         = _mm256_shuffle_epi32(tmpi3, shuf);

    /* pack the second two vectors to int16 in sequential order */
    __m256i shuf_hi_lo = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x20);
    __m256i shuf_hi_hi = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x31);
    const __m256i hi   = _mm256_packs_epi32(shuf_hi_lo, shuf_hi_hi);

    /* final saturating int16 -> int8 pack, again lane-corrected */
    __m256i shuf_lo = _mm256_permute2x128_si256(lo, hi, 0x20);
    __m256i shuf_hi = _mm256_permute2x128_si256(lo, hi, 0x31);

    return _mm256_packs_epi16(shuf_lo, shuf_hi);
}

// fc32 -> sc8, item32 big-endian wire format, AVX2-accelerated.
// Each iteration converts 16 complex samples into 8 item32 words (each
// item32 holds two sc8 samples), hence output index j advances by 8 while
// input index i advances by 16.
DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD_AVX2)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);

    // broadcast the scale factor into all 8 float lanes
    const __m256 scalar = _mm256_set1_ps(float(scale_factor));
    // identity element order within each lane (big-endian wire format)
    const int shuf = _MM_SHUFFLE(3, 2, 1, 0);

    size_t i = 0;

    for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) {
        /* load from input; 4 complex samples (8 floats) per load */
        __m256 tmp0 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 0));
        __m256 tmp1 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 4));
        __m256 tmp2 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 8));
        __m256 tmp3 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 12));

        /* scale, convert, and pack 16 samples down to 32 int8 values */
        const __m256i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar);

        /* store to output */
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi);
    }

    // convert remainder; output offset is i/2 since two samples share an item32
    xx_to_item32_sc8<uhd::htonx>(input + i, output + (i / 2), nsamps - i, scale_factor);
}

// fc32 -> sc8, item32 little-endian wire format, AVX2-accelerated.
// Identical structure to the big-endian converter above; only the in-lane
// 32-bit element order passed to pack_sc32_4x differs.
DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD_AVX2)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);

    // broadcast the scale factor into all 8 float lanes
    const __m256 scalar = _mm256_set1_ps(float(scale_factor));
    // reversed element order within each lane (little-endian wire format)
    const int shuf = _MM_SHUFFLE(0, 1, 2, 3);

    size_t i = 0;

    for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) {
        /* load from input; 4 complex samples (8 floats) per load */
        __m256 tmp0 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 0));
        __m256 tmp1 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 4));
        __m256 tmp2 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 8));
        __m256 tmp3 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 12));

        /* scale, convert, and pack 16 samples down to 32 int8 values */
        const __m256i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar);

        /* store to output */
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi);
    }

    // convert remainder; output offset is i/2 since two samples share an item32
    xx_to_item32_sc8<uhd::htowx>(input + i, output + (i / 2), nsamps - i, scale_factor);
}
Loading
Loading