Skip to content
85 changes: 65 additions & 20 deletions host/lib/convert/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#
# Copyright 2011-2013 Ettus Research LLC
# Copyright 2018 Ettus Research, a National Instruments Company
# Copyright 2024 Ettus Research, a National Instruments Company
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
Expand All @@ -9,29 +10,55 @@
# This file included, use CMake directory variables
########################################################################
include(CheckIncludeFileCXX)
include(CheckCXXCompilerFlag)
message(STATUS "")

########################################################################
# Check for SSE2 SIMD headers
# Check for x86 SIMD compiler support
########################################################################
if(CMAKE_COMPILER_IS_GNUCXX)
set(EMMINTRIN_FLAGS -msse2)
set(TMMINTRIN_FLAGS -mssse3)
elseif(MSVC)
set(EMMINTRIN_FLAGS /arch:SSE2)

# Detect whether the target CPU is x86/x86_64 (the SSE/AVX checks below
# only make sense on these architectures).
set(IS_X86_PLATFORM FALSE)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i[3-6]86")
    set(IS_X86_PLATFORM TRUE)
endif()

set(CMAKE_REQUIRED_FLAGS ${EMMINTRIN_FLAGS})
CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H)
unset(CMAKE_REQUIRED_FLAGS)
if(IS_X86_PLATFORM)
    # Probe the compiler for x86 SIMD support. check_cxx_compiler_flag()
    # records its (cached) result in the COMPILER_SUPPORTS_* variables,
    # which gate the SIMD converter source lists below.
    #
    # NOTE: CMAKE_COMPILER_IS_GNUCXX is a legacy variable; matching
    # CMAKE_CXX_COMPILER_ID against "GNU|Clang" is equivalent (it also
    # covers AppleClang, which the old "Clang" MATCHES already accepted).
    if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
        # GCC/Clang require an explicit -m flag per ISA level.
        check_cxx_compiler_flag("-msse2" COMPILER_SUPPORTS_SSE2)
        check_cxx_compiler_flag("-mssse3" COMPILER_SUPPORTS_SSSE3)
        check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
        set(SSE2_FLAGS -msse2)
        set(SSSE3_FLAGS -mssse3)
        set(AVX2_FLAGS -mavx2)
    elseif(MSVC)
        # MSVC emits SSE2/SSSE3 code by default on x64; there is no
        # dedicated flag for those levels, only for AVX2.
        set(COMPILER_SUPPORTS_SSE2 TRUE)
        set(COMPILER_SUPPORTS_SSSE3 TRUE)
        check_cxx_compiler_flag("/arch:AVX2" COMPILER_SUPPORTS_AVX2)
        set(SSE2_FLAGS "") # Default on x64
        set(SSSE3_FLAGS "") # Default on x64
        set(AVX2_FLAGS /arch:AVX2)
    endif()

    if(COMPILER_SUPPORTS_SSE2)
        message(STATUS "Compiler supports SSE2 - will build SSE2 converters")
    endif()
    if(COMPILER_SUPPORTS_SSSE3)
        message(STATUS "Compiler supports SSSE3 - will build SSSE3 converters")
    endif()
    if(COMPILER_SUPPORTS_AVX2)
        message(STATUS "Compiler supports AVX2 - will build AVX2 converters")
    endif()
endif()

if(HAVE_EMMINTRIN_H)
########################################################################
# x86 SIMD converter sources
########################################################################

if(COMPILER_SUPPORTS_SSE2)
set(convert_with_sse2_sources
${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_sc16.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_fc64.cpp
Expand All @@ -45,22 +72,40 @@ if(HAVE_EMMINTRIN_H)
)
set_source_files_properties(
${convert_with_sse2_sources}
PROPERTIES COMPILE_FLAGS "${EMMINTRIN_FLAGS}"
PROPERTIES COMPILE_FLAGS "${SSE2_FLAGS}"
)
LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources})
endif(HAVE_EMMINTRIN_H)
endif()

if(HAVE_TMMINTRIN_H)
if(COMPILER_SUPPORTS_SSSE3)
    # SSSE3-accelerated sc12 pack/unpack converters
    set(ssse3_converter_sources "")
    foreach(ssse3_src ssse3_pack_sc12.cpp ssse3_unpack_sc12.cpp)
        list(APPEND ssse3_converter_sources
            "${CMAKE_CURRENT_SOURCE_DIR}/${ssse3_src}")
    endforeach()
    set_source_files_properties(
        ${ssse3_converter_sources}
        PROPERTIES COMPILE_FLAGS "${SSSE3_FLAGS}"
    )
    LIBUHD_APPEND_SOURCES(${ssse3_converter_sources})
endif()

if(ENABLE_AVX2 AND COMPILER_SUPPORTS_AVX2)
    # AVX2-accelerated sc16/sc8 <-> fc32/fc64 converters
    set(avx2_converter_sources "")
    foreach(avx2_src
        avx2_sc16_to_sc16.cpp
        avx2_sc16_to_fc64.cpp
        avx2_sc16_to_fc32.cpp
        avx2_sc8_to_fc32.cpp
        avx2_fc64_to_sc16.cpp
        avx2_fc32_to_sc16.cpp
        avx2_fc64_to_sc8.cpp
        avx2_fc32_to_sc8.cpp
    )
        list(APPEND avx2_converter_sources
            "${CMAKE_CURRENT_SOURCE_DIR}/${avx2_src}")
    endforeach()
    set_source_files_properties(
        ${avx2_converter_sources}
        PROPERTIES COMPILE_FLAGS "${AVX2_FLAGS}"
    )
    LIBUHD_APPEND_SOURCES(${avx2_converter_sources})
endif()

########################################################################
# Check for NEON SIMD headers
Expand Down
120 changes: 120 additions & 0 deletions host/lib/convert/avx2_fc32_to_sc16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
//
// Copyright 2026 Ettus Research, a National Instruments Brand
//
// SPDX-License-Identifier: GPL-3.0-or-later
//

#include "convert_common.hpp"
#include <uhd/utils/byteswap.hpp>
#include <immintrin.h>

using namespace uhd::convert;

// fc32 -> sc16, item32 little-endian wire format, AVX2-accelerated.
// Each loop iteration converts 8 complex samples (16 floats); any tail of
// fewer than 8 samples is handled by the scalar fallback at the bottom.
DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD_AVX2)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);

    // broadcast the scale factor into all 8 float lanes
    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    size_t i = 0;

    for (; i + 7 < nsamps; i += 8) {
        /* load from input; each fc32_t is two floats, so one 8-float load
         * covers 4 complex samples */
        __m256 tmplo = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 0));
        __m256 tmphi = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 4));

        /* scale, then convert float -> int32 */
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));

        /* _mm256_packs_epi32 interleaves its operands per 128-bit lane, so
         * pre-arrange the lanes to keep the samples in sequential order */
        __m256i shuffled_lo = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */
        __m256i shuffled_hi = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */

        /* now pack the shuffled data sequentially (saturating int32->int16) */
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);

        /* swap each pair of 16-bit words within the 32-bit items to match
         * the wire order produced by the scalar uhd::htowx path below */
        tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));
        tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));

        /* store to output */
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);
    }

    // convert any remaining samples with the scalar routine
    xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
}

// fc32 -> sc16, item32 big-endian wire format, AVX2-accelerated.
// Same structure as the little-endian converter above, but the final
// reordering is a byte swap within each 16-bit value (cf. uhd::htonx tail).
DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD_AVX2)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);

    // broadcast the scale factor into all 8 float lanes
    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    size_t i = 0;

    for (; i + 7 < nsamps; i += 8) {
        /* load from input; 4 complex samples (8 floats) per load */
        __m256 tmplo = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 0));
        __m256 tmphi = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 4));

        /* scale, then convert float -> int32 */
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));

        /* _mm256_packs_epi32 interleaves its operands per 128-bit lane, so
         * pre-arrange the lanes to keep the samples in sequential order */
        __m256i shuffled_lo = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */
        __m256i shuffled_hi = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */

        /* Now pack the shuffled data sequentially (saturating int32->int16) */
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);

        /* byte-swap each 16-bit value (shift left + shift right + OR) */
        tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8));

        /* store to output */
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);
    }

    // convert any remaining samples with the scalar routine
    xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
}

// fc32 -> sc16 in CHDR (host) order, AVX2-accelerated.
// Unlike the item32 converters above, the output is written as sc16_t
// directly with no final byte/word reordering step.
DECLARE_CONVERTER(fc32, 1, sc16_chdr, 1, PRIORITY_SIMD_AVX2)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    sc16_t* output = reinterpret_cast<sc16_t*>(outputs[0]);

    // broadcast the scale factor into all 8 float lanes
    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    size_t i = 0;

    for (; i + 7 < nsamps; i += 8) {
        /* load from input; 4 complex samples (8 floats) per load */
        __m256 tmplo = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 0));
        __m256 tmphi = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 4));

        /* scale, then convert float -> int32 */
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));

        /* mm256_packs_epi32 is not sequential (it interleaves its operands
         * per 128-bit lane), so the lanes must be pre-arranged first */
        __m256i shuffled_lo = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */
        __m256i shuffled_hi = _mm256_permute2x128_si256(
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */

        /* Now pack the shuffled data sequentially (saturating int32->int16) */
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);

        /* store to output */
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);
    }

    // convert any remaining samples with the scalar routine
    xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor);
}
98 changes: 98 additions & 0 deletions host/lib/convert/avx2_fc32_to_sc8.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
//
// Copyright 2026 Ettus Research, a National Instruments Brand
//
// SPDX-License-Identifier: GPL-3.0-or-later
//

#include "convert_common.hpp"
#include <uhd/utils/byteswap.hpp>
#include <immintrin.h>

using namespace uhd::convert;

/*
 * Scale and pack four vectors of 8 floats (= 16 fc32 samples) into one
 * __m256i of 32 saturated int8 values, preserving sample order.
 *
 * shuf: an _MM_SHUFFLE constant applied to the 32-bit elements of each
 *       converted vector before packing; callers use it to select the
 *       element order required by the target endianness.
 *
 * Both _mm256_packs_epi32 and _mm256_packs_epi16 interleave their operands
 * per 128-bit lane, so every pack is preceded by _mm256_permute2x128_si256
 * calls (0x20 = low lanes of both inputs, 0x31 = high lanes of both inputs)
 * to keep the data sequential.
 */
template <const int shuf>
UHD_INLINE __m256i pack_sc32_4x(const __m256& in0,
    const __m256& in1,
    const __m256& in2,
    const __m256& in3,
    const __m256& scalar)
{
    /* scale, convert to int32, and reorder 32-bit elements */
    __m256i tmpi0 = _mm256_cvtps_epi32(_mm256_mul_ps(in0, scalar));
    tmpi0         = _mm256_shuffle_epi32(tmpi0, shuf);
    __m256i tmpi1 = _mm256_cvtps_epi32(_mm256_mul_ps(in1, scalar));
    tmpi1         = _mm256_shuffle_epi32(tmpi1, shuf);

    /* pack the first two vectors to int16 in sequential order */
    __m256i shuf_lo_lo = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x20);
    __m256i shuf_lo_hi = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x31);
    const __m256i lo   = _mm256_packs_epi32(shuf_lo_lo, shuf_lo_hi);

    __m256i tmpi2 = _mm256_cvtps_epi32(_mm256_mul_ps(in2, scalar));
    tmpi2         = _mm256_shuffle_epi32(tmpi2, shuf);
    __m256i tmpi3 = _mm256_cvtps_epi32(_mm256_mul_ps(in3, scalar));
    tmpi3         = _mm256_shuffle_epi32(tmpi3, shuf);

    /* pack the second two vectors to int16 in sequential order */
    __m256i shuf_hi_lo = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x20);
    __m256i shuf_hi_hi = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x31);
    const __m256i hi   = _mm256_packs_epi32(shuf_hi_lo, shuf_hi_hi);

    /* final saturating int16 -> int8 pack, again lane-corrected */
    __m256i shuf_lo = _mm256_permute2x128_si256(lo, hi, 0x20);
    __m256i shuf_hi = _mm256_permute2x128_si256(lo, hi, 0x31);

    return _mm256_packs_epi16(shuf_lo, shuf_hi);
}

// fc32 -> sc8, item32 big-endian wire format, AVX2-accelerated.
// Each iteration converts 16 complex samples into 8 item32 words (each
// item32 holds two sc8 samples), hence output index j advances by 8 while
// input index i advances by 16.
DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD_AVX2)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);

    // broadcast the scale factor into all 8 float lanes
    const __m256 scalar = _mm256_set1_ps(float(scale_factor));
    // identity element order within each lane (big-endian wire format)
    const int shuf = _MM_SHUFFLE(3, 2, 1, 0);

    size_t i = 0;

    for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) {
        /* load from input; 4 complex samples (8 floats) per load */
        __m256 tmp0 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 0));
        __m256 tmp1 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 4));
        __m256 tmp2 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 8));
        __m256 tmp3 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 12));

        /* scale, convert, and pack 16 samples down to 32 int8 values */
        const __m256i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar);

        /* store to output */
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi);
    }

    // convert remainder; output offset is i/2 since two samples share an item32
    xx_to_item32_sc8<uhd::htonx>(input + i, output + (i / 2), nsamps - i, scale_factor);
}

// fc32 -> sc8, item32 little-endian wire format, AVX2-accelerated.
// Identical structure to the big-endian converter above; only the in-lane
// 32-bit element order passed to pack_sc32_4x differs.
DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD_AVX2)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output = reinterpret_cast<item32_t*>(outputs[0]);

    // broadcast the scale factor into all 8 float lanes
    const __m256 scalar = _mm256_set1_ps(float(scale_factor));
    // reversed element order within each lane (little-endian wire format)
    const int shuf = _MM_SHUFFLE(0, 1, 2, 3);

    size_t i = 0;

    for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) {
        /* load from input; 4 complex samples (8 floats) per load */
        __m256 tmp0 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 0));
        __m256 tmp1 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 4));
        __m256 tmp2 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 8));
        __m256 tmp3 = _mm256_loadu_ps(reinterpret_cast<const float*>(input + i + 12));

        /* scale, convert, and pack 16 samples down to 32 int8 values */
        const __m256i tmpi = pack_sc32_4x<shuf>(tmp0, tmp1, tmp2, tmp3, scalar);

        /* store to output */
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi);
    }

    // convert remainder; output offset is i/2 since two samples share an item32
    xx_to_item32_sc8<uhd::htowx>(input + i, output + (i / 2), nsamps - i, scale_factor);
}
Loading
Loading