diff --git a/host/lib/convert/CMakeLists.txt b/host/lib/convert/CMakeLists.txt index aee62c82f4..ecddcd07f0 100644 --- a/host/lib/convert/CMakeLists.txt +++ b/host/lib/convert/CMakeLists.txt @@ -1,6 +1,7 @@ # # Copyright 2011-2013 Ettus Research LLC # Copyright 2018 Ettus Research, a National Instruments Company +# Copyright 2024 Ettus Research, a National Instruments Company # # SPDX-License-Identifier: GPL-3.0-or-later # @@ -9,29 +10,55 @@ # This file included, use CMake directory variables ######################################################################## include(CheckIncludeFileCXX) +include(CheckCXXCompilerFlag) message(STATUS "") ######################################################################## -# Check for SSE2 SIMD headers +# Check for x86 SIMD compiler support ######################################################################## -if(CMAKE_COMPILER_IS_GNUCXX) - set(EMMINTRIN_FLAGS -msse2) - set(TMMINTRIN_FLAGS -mssse3) -elseif(MSVC) - set(EMMINTRIN_FLAGS /arch:SSE2) + +# Check if we're on an x86 platform +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i[3-6]86") + set(IS_X86_PLATFORM TRUE) +else() + set(IS_X86_PLATFORM FALSE) endif() -set(CMAKE_REQUIRED_FLAGS ${EMMINTRIN_FLAGS}) -CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H) -unset(CMAKE_REQUIRED_FLAGS) +if(IS_X86_PLATFORM) + # Check for SSE2 compiler support + if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + check_cxx_compiler_flag("-msse2" COMPILER_SUPPORTS_SSE2) + check_cxx_compiler_flag("-mssse3" COMPILER_SUPPORTS_SSSE3) + check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2) + set(SSE2_FLAGS -msse2) + set(SSSE3_FLAGS -mssse3) + set(AVX2_FLAGS -mavx2) + elseif(MSVC) + # MSVC doesn't need flags for SSE2 on x64, but we can check + set(COMPILER_SUPPORTS_SSE2 TRUE) + set(COMPILER_SUPPORTS_SSSE3 TRUE) + check_cxx_compiler_flag("/arch:AVX2" COMPILER_SUPPORTS_AVX2) + set(SSE2_FLAGS "") # Default on x64 + set(SSSE3_FLAGS "") # Default on x64 + set(AVX2_FLAGS /arch:AVX2) + endif() -if(ENABLE_SSSE3) -set(CMAKE_REQUIRED_FLAGS ${TMMINTRIN_FLAGS}) -CHECK_INCLUDE_FILE_CXX(tmmintrin.h HAVE_TMMINTRIN_H) -unset(CMAKE_REQUIRED_FLAGS) -endif(ENABLE_SSSE3) + if(COMPILER_SUPPORTS_SSE2) + message(STATUS "Compiler supports SSE2 - will build SSE2 converters") + endif() + if(COMPILER_SUPPORTS_SSSE3) + message(STATUS "Compiler supports SSSE3 - will build SSSE3 converters") + endif() + if(COMPILER_SUPPORTS_AVX2) + message(STATUS "Compiler supports AVX2 - will build AVX2 converters") + endif() +endif() -if(HAVE_EMMINTRIN_H) +######################################################################## +# x86 SIMD converter sources +######################################################################## + +if(COMPILER_SUPPORTS_SSE2) set(convert_with_sse2_sources ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_sc16.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_fc64.cpp @@ -45,22 +72,40 @@ if(HAVE_EMMINTRIN_H) ) set_source_files_properties( ${convert_with_sse2_sources} - PROPERTIES COMPILE_FLAGS "${EMMINTRIN_FLAGS}" + PROPERTIES COMPILE_FLAGS "${SSE2_FLAGS}" ) LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources}) -endif(HAVE_EMMINTRIN_H) +endif() -if(HAVE_TMMINTRIN_H) +if(COMPILER_SUPPORTS_SSSE3) set(convert_with_ssse3_sources ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_pack_sc12.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_unpack_sc12.cpp ) set_source_files_properties( ${convert_with_ssse3_sources} - PROPERTIES COMPILE_FLAGS "${TMMINTRIN_FLAGS}" + PROPERTIES COMPILE_FLAGS "${SSSE3_FLAGS}" ) 
LIBUHD_APPEND_SOURCES(${convert_with_ssse3_sources}) -endif(HAVE_TMMINTRIN_H) +endif() + +if(ENABLE_AVX2 AND COMPILER_SUPPORTS_AVX2) + set(convert_with_avx2_sources + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc64.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc8_to_fc32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc8.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc8.cpp + ) + set_source_files_properties( + ${convert_with_avx2_sources} + PROPERTIES COMPILE_FLAGS "${AVX2_FLAGS}" + ) + LIBUHD_APPEND_SOURCES(${convert_with_avx2_sources}) +endif() ######################################################################## # Check for NEON SIMD headers diff --git a/host/lib/convert/avx2_fc32_to_sc16.cpp b/host/lib/convert/avx2_fc32_to_sc16.cpp new file mode 100644 index 0000000000..e00cc04df2 --- /dev/null +++ b/host/lib/convert/avx2_fc32_to_sc16.cpp @@ -0,0 +1,120 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD_AVX2) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256 tmplo = _mm256_loadu_ps(reinterpret_cast(input + i + 0)); + __m256 tmphi = _mm256_loadu_ps(reinterpret_cast(input + i + 4)); + + /* convert and scale */ + __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); + + __m256i shuffled_lo = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ + __m256i shuffled_hi = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ + + /* now pack the shuffled data sequentially */ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); + + /* pack + swap 16-bit pairs */ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD_AVX2) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256 tmplo = _mm256_loadu_ps(reinterpret_cast(input + i + 0)); + __m256 tmphi = _mm256_loadu_ps(reinterpret_cast(input + i + 4)); + + /* convert and scale */ + __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); + + __m256i shuffled_lo = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ + __m256i shuffled_hi = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ + + /* Now pack the shuffled data sequentially */ + __m256i tmpi = 
_mm256_packs_epi32(shuffled_lo, shuffled_hi); + + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc16_chdr, 1, PRIORITY_SIMD_AVX2) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256 tmplo = _mm256_loadu_ps(reinterpret_cast(input + i + 0)); + __m256 tmphi = _mm256_loadu_ps(reinterpret_cast(input + i + 4)); + + /* convert and scale */ + __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); + + /* mm256_packs_epi32 is not sequential, it needs to be split into m128i */ + __m256i shuffled_lo = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ + __m256i shuffled_hi = _mm256_permute2x128_si256( + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ + + /* Now pack the shuffled data sequentially */ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert any remaining samples + xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc32_to_sc8.cpp b/host/lib/convert/avx2_fc32_to_sc8.cpp new file mode 100644 index 0000000000..bf80423eab --- /dev/null +++ b/host/lib/convert/avx2_fc32_to_sc8.cpp @@ -0,0 +1,98 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +template +UHD_INLINE __m256i pack_sc32_4x(const __m256& in0, + const __m256& in1, + const __m256& in2, + const __m256& in3, + const __m256& scalar) +{ + __m256i tmpi0 = _mm256_cvtps_epi32(_mm256_mul_ps(in0, scalar)); + tmpi0 = _mm256_shuffle_epi32(tmpi0, shuf); + __m256i tmpi1 = _mm256_cvtps_epi32(_mm256_mul_ps(in1, scalar)); + tmpi1 = _mm256_shuffle_epi32(tmpi1, shuf); + + __m256i shuf_lo_lo = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x20); + __m256i shuf_lo_hi = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x31); + const __m256i lo = _mm256_packs_epi32(shuf_lo_lo, shuf_lo_hi); + + __m256i tmpi2 = _mm256_cvtps_epi32(_mm256_mul_ps(in2, scalar)); + tmpi2 = _mm256_shuffle_epi32(tmpi2, shuf); + __m256i tmpi3 = _mm256_cvtps_epi32(_mm256_mul_ps(in3, scalar)); + tmpi3 = _mm256_shuffle_epi32(tmpi3, shuf); + + __m256i shuf_hi_lo = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x20); + __m256i shuf_hi_hi = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x31); + const __m256i hi = _mm256_packs_epi32(shuf_hi_lo, shuf_hi_hi); + + __m256i shuf_lo = _mm256_permute2x128_si256(lo, hi, 0x20); + __m256i shuf_hi = _mm256_permute2x128_si256(lo, hi, 0x31); + + return _mm256_packs_epi16(shuf_lo, shuf_hi); +} + +DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD_AVX2) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + const int shuf = _MM_SHUFFLE(3, 2, 1, 0); + + size_t i = 0; + + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) 
{ + /* load from input */ + __m256 tmp0 = _mm256_loadu_ps(reinterpret_cast(input + i + 0)); + __m256 tmp1 = _mm256_loadu_ps(reinterpret_cast(input + i + 4)); + __m256 tmp2 = _mm256_loadu_ps(reinterpret_cast(input + i + 8)); + __m256 tmp3 = _mm256_loadu_ps(reinterpret_cast(input + i + 12)); + + /* convert */ + const __m256i tmpi = pack_sc32_4x(tmp0, tmp1, tmp2, tmp3, scalar); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD_AVX2) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + const int shuf = _MM_SHUFFLE(0, 1, 2, 3); + + size_t i = 0; + + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { + /* load from input */ + __m256 tmp0 = _mm256_loadu_ps(reinterpret_cast(input + i + 0)); + __m256 tmp1 = _mm256_loadu_ps(reinterpret_cast(input + i + 4)); + __m256 tmp2 = _mm256_loadu_ps(reinterpret_cast(input + i + 8)); + __m256 tmp3 = _mm256_loadu_ps(reinterpret_cast(input + i + 12)); + + /* convert */ + const __m256i tmpi = pack_sc32_4x(tmp0, tmp1, tmp2, tmp3, scalar); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc64_to_sc16.cpp b/host/lib/convert/avx2_fc64_to_sc16.cpp new file mode 100644 index 0000000000..5794b3c9e2 --- /dev/null +++ b/host/lib/convert/avx2_fc64_to_sc16.cpp @@ -0,0 +1,139 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD_AVX2) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256d tmp0 = _mm256_loadu_pd(reinterpret_cast(input + i + 0)); + __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast(input + i + 2)); + __m256d tmp2 = _mm256_loadu_pd(reinterpret_cast(input + i + 4)); + __m256d tmp3 = _mm256_loadu_pd(reinterpret_cast(input + i + 6)); + + /* convert and scale */ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); + + /* Unpack and interleave the results */ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); + + /* Pack and swap 16-bit pairs */ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); + + /* pack + swap 16-bit pairs */ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); + + /* pack + swap 16-bit pairs */ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert remainder + 
xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD_AVX2) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256d tmp0 = _mm256_loadu_pd(reinterpret_cast(input + i + 0)); + __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast(input + i + 2)); + __m256d tmp2 = _mm256_loadu_pd(reinterpret_cast(input + i + 4)); + __m256d tmp3 = _mm256_loadu_pd(reinterpret_cast(input + i + 6)); + + /* convert and scale */ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); + + /* Unpack and interleave the results */ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); + + /* Pack and swap 16-bit pairs */ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); + + /* pack + swap 16-bit pairs */ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert remainder + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc16_chdr, 1, PRIORITY_SIMD_AVX2) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256d tmp0 = _mm256_loadu_pd(reinterpret_cast(input + i + 0)); + __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast(input + i + 2)); + __m256d tmp2 = _mm256_loadu_pd(reinterpret_cast(input + i + 4)); + __m256d tmp3 = _mm256_loadu_pd(reinterpret_cast(input + i + 6)); + + /* convert and scale */ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); + + /* Unpack and interleave the results */ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); + + /* Pack and swap 16-bit pairs */ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); + + /* pack + swap 16-bit pairs */ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); + } + + // convert remainder + xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc64_to_sc8.cpp b/host/lib/convert/avx2_fc64_to_sc8.cpp new file mode 100644 index 0000000000..ba320f3eb5 --- /dev/null +++ b/host/lib/convert/avx2_fc64_to_sc8.cpp @@ -0,0 +1,103 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using 
namespace uhd::convert; + +UHD_INLINE __m256i pack_sc8_item32_4x( + const __m256i& in0, const __m256i& in1, const __m256i& in2, const __m256i& in3) +{ + const __m256i shuffled_in0_lo = _mm256_permute2x128_si256(in0, in1, 0x20); + const __m256i shuffled_in0_hi = _mm256_permute2x128_si256(in0, in1, 0x31); + const __m256i shuffled_in1_lo = _mm256_permute2x128_si256(in2, in3, 0x20); + const __m256i shuffled_in1_hi = _mm256_permute2x128_si256(in2, in3, 0x31); + + const __m256i lo = _mm256_packs_epi32(shuffled_in0_lo, shuffled_in0_hi); + const __m256i hi = _mm256_packs_epi32(shuffled_in1_lo, shuffled_in1_hi); + return _mm256_packs_epi16(lo, hi); +} + +UHD_INLINE __m256i pack_sc32_4x( + const __m256d& lo, const __m256d& hi, const __m256d& scalar) +{ + const __m128i tmpi_lo = _mm256_cvttpd_epi32(_mm256_mul_pd(hi, scalar)); + const __m128i tmpi_hi = _mm256_cvttpd_epi32(_mm256_mul_pd(lo, scalar)); + + return _mm256_set_m128i(tmpi_hi, tmpi_lo); +} + +DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD_AVX2) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { + /* load from input */ + __m256d tmp0 = _mm256_loadu_pd(reinterpret_cast(input + i + 0)); + __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast(input + i + 2)); + __m256d tmp2 = _mm256_loadu_pd(reinterpret_cast(input + i + 4)); + __m256d tmp3 = _mm256_loadu_pd(reinterpret_cast(input + i + 6)); + __m256d tmp4 = _mm256_loadu_pd(reinterpret_cast(input + i + 8)); + __m256d tmp5 = _mm256_loadu_pd(reinterpret_cast(input + i + 10)); + __m256d tmp6 = _mm256_loadu_pd(reinterpret_cast(input + i + 12)); + __m256d tmp7 = _mm256_loadu_pd(reinterpret_cast(input + i + 14)); + + /* interleave */ + const __m256i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp1, tmp0, scalar), + pack_sc32_4x(tmp3, tmp2, scalar), + pack_sc32_4x(tmp5, tmp4, scalar), + pack_sc32_4x(tmp7, tmp6, scalar)); + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD_AVX2) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { + /* load from input */ + __m256d tmp0 = _mm256_loadu_pd(reinterpret_cast(input + i + 0)); + __m256d tmp1 = _mm256_loadu_pd(reinterpret_cast(input + i + 2)); + __m256d tmp2 = _mm256_loadu_pd(reinterpret_cast(input + i + 4)); + __m256d tmp3 = _mm256_loadu_pd(reinterpret_cast(input + i + 6)); + __m256d tmp4 = _mm256_loadu_pd(reinterpret_cast(input + i + 8)); + __m256d tmp5 = _mm256_loadu_pd(reinterpret_cast(input + i + 10)); + __m256d tmp6 = _mm256_loadu_pd(reinterpret_cast(input + i + 12)); + __m256d tmp7 = _mm256_loadu_pd(reinterpret_cast(input + i + 14)); + + /* interleave */ + __m256i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp0, tmp1, scalar), + pack_sc32_4x(tmp2, tmp3, scalar), + pack_sc32_4x(tmp4, tmp5, scalar), + pack_sc32_4x(tmp6, tmp7, scalar)); + tmpi = _mm256_or_si256( + _mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); /*byteswap*/ + + /* store to output */ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); + } + + // convert remainder + xx_to_item32_sc8(input + i, 
output + (i / 2), nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_sc16_to_fc32.cpp b/host/lib/convert/avx2_sc16_to_fc32.cpp new file mode 100644 index 0000000000..338f74253a --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_fc32.cpp @@ -0,0 +1,112 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* swap 16-bit pairs: [imag, real] -> [real, imag] */ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + + /* split into 128-bit halves and sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(tmpi)); + __m256i int32_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(tmpi, 1)); + + /* convert to float and scale */ + __m256 tmplo = _mm256_mul_ps(_mm256_cvtepi32_ps(int32_lo), scalar); + __m256 tmphi = _mm256_mul_ps(_mm256_cvtepi32_ps(int32_hi), scalar); + + /* store to output */ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), tmplo); + _mm256_storeu_ps(reinterpret_cast(output + i + 4), tmphi); + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* byteswap within each 16-bit word */ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); + + /* split into 128-bit halves and sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(tmpi)); + __m256i int32_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(tmpi, 1)); + + /* convert to float and scale */ + __m256 tmplo = _mm256_mul_ps(_mm256_cvtepi32_ps(int32_lo), scalar); + __m256 tmphi = _mm256_mul_ps(_mm256_cvtepi32_ps(int32_hi), scalar); + + /* store to output */ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), tmplo); + _mm256_storeu_ps(reinterpret_cast(output + i + 4), tmphi); + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_chdr, 1, fc32, 1, PRIORITY_SIMD_AVX2) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load 8 complex samples as 2x 128-bit halves */ + __m128i in_lo = _mm_loadu_si128(reinterpret_cast(input + i)); + __m128i in_hi = _mm_loadu_si128(reinterpret_cast(input + i + 4)); + + /* sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(in_lo); + __m256i int32_hi = _mm256_cvtepi16_epi32(in_hi); + + /* convert to float and scale */ + __m256 float_lo = 
_mm256_mul_ps(_mm256_cvtepi32_ps(int32_lo), scalar); + __m256 float_hi = _mm256_mul_ps(_mm256_cvtepi32_ps(int32_hi), scalar); + + /* store to output */ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), float_lo); + _mm256_storeu_ps(reinterpret_cast(output + i + 4), float_hi); + } + + // convert any remaining samples + for (; i < nsamps; i++) { + output[i] = fc32_t(float(input[i].real()) * float(scale_factor), + float(input[i].imag()) * float(scale_factor)); + } +} diff --git a/host/lib/convert/avx2_sc16_to_fc64.cpp b/host/lib/convert/avx2_sc16_to_fc64.cpp new file mode 100644 index 0000000000..4841089dd1 --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_fc64.cpp @@ -0,0 +1,121 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* swap 16-bit pairs: [imag, real] -> [real, imag] */ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); + + /* sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(tmpi)); + __m256i int32_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(tmpi, 1)); + + /* convert to double and scale */ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_lo)), scalar); + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_lo, 1)), scalar); + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_hi)), scalar); + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_hi, 1)), scalar); + + /* store to output */ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); + } + + // convert remainder + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* byteswap within each 16-bit word */ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); + + /* sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(tmpi)); + __m256i int32_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(tmpi, 1)); + + /* convert to double and scale */ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_lo)), scalar); + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_lo, 1)), scalar); + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_hi)), scalar); + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_hi, 1)), 
scalar); + + /* store to output */ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); + } + + // convert remainder + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_chdr, 1, fc64, 1, PRIORITY_SIMD_AVX2) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + /* load 8 complex samples as 2x 128-bit halves */ + __m128i in_lo = _mm_loadu_si128(reinterpret_cast(input + i)); + __m128i in_hi = _mm_loadu_si128(reinterpret_cast(input + i + 4)); + + /* sign-extend int16 to int32 */ + __m256i int32_lo = _mm256_cvtepi16_epi32(in_lo); + __m256i int32_hi = _mm256_cvtepi16_epi32(in_hi); + + /* convert to double and scale */ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_lo)), scalar); + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_lo, 1)), scalar); + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(int32_hi)), scalar); + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(int32_hi, 1)), scalar); + + /* store to output */ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); + } + + // convert remainder + chdr_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_sc16_to_sc16.cpp b/host/lib/convert/avx2_sc16_to_sc16.cpp new file mode 100644 index 0000000000..76192e3eb4 --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_sc16.cpp @@ -0,0 +1,113 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(sc16, 1, sc16_item32_le, 1, PRIORITY_SIMD_AVX2) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + __m256i m0; + + /* load from input */ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); + + /* swap 16-bit pairs */ + m0 = _mm256_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); + m0 = _mm256_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); + + /* store to output */ + _mm256_storeu_si256((__m256i*)(output + i), m0); + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16, 1, sc16_item32_be, 1, PRIORITY_SIMD_AVX2) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + __m256i m0, m1, m2; + + /* load from input */ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); + + /* byteswap 16 bit words */ + m1 = _mm256_srli_epi16(m0, 8); + m2 = _mm256_slli_epi16(m0, 8); + m0 = _mm256_or_si256(m1, m2); + + /* store to output */ + _mm256_storeu_si256((__m256i*)(output + i), m0); + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16_item32_le, 1, sc16, 1, 
PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + __m256i m0; + + /* load from input */ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); + + /* swap 16-bit pairs */ + m0 = _mm256_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); + m0 = _mm256_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); + + /* store to output */ + _mm256_storeu_si256((__m256i*)(output + i), m0); + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16_item32_be, 1, sc16, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + for (; i + 7 < nsamps; i += 8) { + __m256i m0, m1, m2; + + /* load from input */ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); + + /* byteswap 16 bit words */ + m1 = _mm256_srli_epi16(m0, 8); + m2 = _mm256_slli_epi16(m0, 8); + m0 = _mm256_or_si256(m1, m2); + + /* store to output */ + _mm256_storeu_si256((__m256i*)(output + i), m0); + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, 1.0); +} diff --git a/host/lib/convert/avx2_sc8_to_fc32.cpp b/host/lib/convert/avx2_sc8_to_fc32.cpp new file mode 100644 index 0000000000..50b9befb7a --- /dev/null +++ b/host/lib/convert/avx2_sc8_to_fc32.cpp @@ -0,0 +1,105 @@ +// +// Copyright 2026 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +static const __m256i zeroi = _mm256_setzero_si256(); + +template +UHD_INLINE void unpack_sc32_4x(const __m256i& in, + __m256& out0, + __m256& out1, + __m256& out2, + __m256& out3, + const __m256& scalar) +{ + const __m256i tmplo = _mm256_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */ + __m256i tmp0 = _mm256_shuffle_epi32( + _mm256_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */ + __m256i tmp1 = _mm256_shuffle_epi32(_mm256_unpackhi_epi16(zeroi, tmplo), shuf); + out0 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp0), scalar); + out1 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp1), scalar); + + const __m256i tmphi = _mm256_unpackhi_epi8(zeroi, in); + __m256i tmp2 = _mm256_shuffle_epi32(_mm256_unpacklo_epi16(zeroi, tmphi), shuf); + __m256i tmp3 = _mm256_shuffle_epi32(_mm256_unpackhi_epi16(zeroi, tmphi), shuf); + out2 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp2), scalar); + out3 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp3), scalar); +} + +DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(size_t(inputs[0]) & ~0x3); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 24)); + const int shuf = _MM_SHUFFLE(3, 2, 1, 0); + + size_t i = 0, j = 0; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0) { + item32_sc8_to_xx(input++, output++, 1, scale_factor); + num_samps--; + } + + for (; j + 15 < num_samps; j += 16, i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* unpack + swap 8-bit pairs */ + __m256 tmp0, tmp1, tmp2, tmp3; + unpack_sc32_4x(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); + + /* store to output */ + _mm256_storeu_ps(reinterpret_cast(output + j + 0), tmp0); + _mm256_storeu_ps(reinterpret_cast(output + j + 4), tmp1); + 
_mm256_storeu_ps(reinterpret_cast(output + j + 8), tmp2); + _mm256_storeu_ps(reinterpret_cast(output + j + 12), tmp3); + } + + // convert remainder + item32_sc8_to_xx(input + i, output + j, num_samps - j, scale_factor); +} + +DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD_AVX2) +{ + const item32_t* input = reinterpret_cast(size_t(inputs[0]) & ~0x3); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 24)); + const int shuf = _MM_SHUFFLE(0, 1, 2, 3); + + size_t i = 0, j = 0; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0) { + item32_sc8_to_xx(input++, output++, 1, scale_factor); + num_samps--; + } + + for (; j + 15 < num_samps; j += 16, i += 8) { + /* load from input */ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); + + /* unpack + swap 8-bit pairs */ + __m256 tmp0, tmp1, tmp2, tmp3; + unpack_sc32_4x(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); + + /* store to output */ + _mm256_storeu_ps(reinterpret_cast(output + j + 0), tmp0); + _mm256_storeu_ps(reinterpret_cast(output + j + 4), tmp1); + _mm256_storeu_ps(reinterpret_cast(output + j + 8), tmp2); + _mm256_storeu_ps(reinterpret_cast(output + j + 12), tmp3); + } + + // convert remainder + item32_sc8_to_xx(input + i, output + j, num_samps - j, scale_factor); +} diff --git a/host/lib/convert/convert_common.hpp b/host/lib/convert/convert_common.hpp index 97c0351329..a81ece72fb 100644 --- a/host/lib/convert/convert_common.hpp +++ b/host/lib/convert/convert_common.hpp @@ -64,6 +64,17 @@ /*********************************************************************** * Setup priorities + * + * Higher priority = preferred implementation + * When get_converter() is called with priority=-1, it returns the + * highest priority converter available. + * + * Priority hierarchy: + * PRIORITY_EMPTY = -1 (empty/null converter) + * PRIORITY_GENERAL = 0 (generic C++ implementation) + * PRIORITY_TABLE = 1 (table lookup) + * PRIORITY_SIMD = 3 (SSE2/NEON - baseline SIMD) + * PRIORITY_SIMD_AVX2 = 4 (AVX2 - 256-bit SIMD) **********************************************************************/ static const int PRIORITY_GENERAL = 0; static const int PRIORITY_EMPTY = -1; @@ -72,10 +83,12 @@ static const int PRIORITY_EMPTY = -1; static const int PRIORITY_SIMD = 2; static const int PRIORITY_TABLE = 1; // tables require large cache, so they are slower on arm +static const int PRIORITY_SIMD_AVX2 = 2; // Not applicable on ARM #else // We used to have ORC, too, so SIMD is 3 -static const int PRIORITY_SIMD = 3; -static const int PRIORITY_TABLE = 1; +static const int PRIORITY_SIMD = 3; +static const int PRIORITY_TABLE = 1; +static const int PRIORITY_SIMD_AVX2 = 4; #endif /*********************************************************************** diff --git a/host/tests/convert_test.cpp b/host/tests/convert_test.cpp index cf6c6a00ed..40fb448601 100644 --- a/host/tests/convert_test.cpp +++ b/host/tests/convert_test.cpp @@ -43,7 +43,7 @@ struct benchmark_result // List of priority types. This must be manually kept in sync with whatever is // defined in convert_common.hpp -const std::array CONV_PRIO_TYPES{-1, 0, 1, 2, 3}; +const std::array CONV_PRIO_TYPES{-1, 0, 1, 2, 3, 4}; // Use this to create a converter with fixed prio in a test case. If prio does // not exist, we simply exit the test case. That's normal. 
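The priority constants above feed converter selection at runtime: uhd::convert::get_converter() called with a priority of -1 returns the highest-priority converter registered for a given id, so the AVX2 kernels added in this change are picked up automatically on builds where they were compiled in, with SSE2, table lookup, and the generic path as fallbacks. Below is a minimal sketch of that call pattern. It reuses only calls that appear elsewhere in this diff (the id_type fields, get_converter(), set_scalar(), conv()); the uhd/convert.hpp header path, function name, and buffer handling are illustrative assumptions, not part of this change.

    // Hedged sketch: convert one buffer of fc32 samples to the sc16 little-endian
    // wire format with whatever converter has the highest registered priority
    // (AVX2 when available, otherwise SSE2, table lookup, or the generic C++ path).
    #include <uhd/convert.hpp>
    #include <complex>
    #include <cstdint>
    #include <vector>

    void fc32_to_wire(const std::vector<std::complex<float>>& in,
        std::vector<uint32_t>& out,
        const double scale_factor) // e.g. 32768.0, as in get_converter_configs()
    {
        uhd::convert::id_type id;
        id.input_format  = "fc32";
        id.num_inputs    = 1;
        id.output_format = "sc16_item32_le";
        id.num_outputs   = 1;

        // priority -1 means "best available"; an explicit priority (e.g. 4 for AVX2)
        // throws uhd::key_error if that converter was not built on this platform.
        uhd::convert::converter::sptr conv = uhd::convert::get_converter(id, -1)();
        conv->set_scalar(scale_factor);

        out.resize(in.size()); // one item32 word per complex sample
        const std::vector<const void*> in_refs(1, static_cast<const void*>(in.data()));
        const std::vector<void*> out_refs(1, static_cast<void*>(out.data()));
        conv->conv(in_refs, out_refs, in.size());
    }

This is the same probing mechanism the benchmark changes below rely on: run_batch_benchmark() requests each priority explicitly and treats a uhd::key_error from get_converter() as "no converter available at this tier".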
diff --git a/host/utils/converter_benchmark.cpp b/host/utils/converter_benchmark.cpp index e02329e017..9580d1e9ab 100644 --- a/host/utils/converter_benchmark.cpp +++ b/host/utils/converter_benchmark.cpp @@ -14,11 +14,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include namespace po = boost::program_options; using namespace uhd::convert; @@ -261,6 +264,240 @@ std::string item_to_string( } } +// For batch of benchmarks +std::string get_priority_name(int prio) +{ + switch (prio) { + case 0: + return "Generic"; + case 1: + return "Unrolled"; + case 2: + return "NEON"; + case 3: + return "SSE2/SSSE3"; + case 4: + return "AVX2"; + default: + return "Unknown(" + std::to_string(prio) + ")"; + } +} + +struct ConverterConfig +{ + std::string input_format; + std::string output_format; + double scale_factor; + std::string description; +}; + +std::vector get_converter_configs() +{ + return { + // sc16 <-> fc32 + {"sc16_item32_le", "fc32", 1.0 / 32768.0, "Wire LE to float"}, + {"fc32", "sc16_item32_le", 32768.0, "Float to wire LE"}, + {"sc16_item32_be", "fc32", 1.0 / 32768.0, "Wire BE to float"}, + {"fc32", "sc16_item32_be", 32768.0, "Float to wire BE"}, + {"sc16_chdr", "fc32", 1.0 / 32768.0, "CHDR to float"}, + {"fc32", "sc16_chdr", 32768.0, "Float to CHDR"}, + + // sc16 <-> fc64 + {"sc16_item32_le", "fc64", 1.0 / 32768.0, "Wire LE to double"}, + {"fc64", "sc16_item32_le", 32768.0, "Double to wire LE"}, + {"sc16_chdr", "fc64", 1.0 / 32768.0, "CHDR to double"}, + {"fc64", "sc16_chdr", 32768.0, "Double to CHDR"}, + + // sc8 <-> fc32 + {"sc8_item32_le", "fc32", 1.0 / 128.0, "8-bit wire to float"}, + {"fc32", "sc8_item32_le", 128.0, "Float to 8-bit wire"}, + + // sc16 passthrough + {"sc16_item32_le", "sc16", 1.0, "Wire LE to native sc16"}, + {"sc16", "sc16_item32_le", 1.0, "Native sc16 to wire LE"}, + {"sc16_item32_be", "sc16", 1.0, "Wire BE to native sc16"}, + {"sc16", "sc16_item32_be", 1.0, "Native sc16 to wire BE"}, + + // sc12 + {"sc12_item32_le", "sc16", 1.0, "12-bit to sc16"}, + {"sc16", "sc12_item32_le", 1.0, "sc16 to 12-bit"}, + {"sc12_item32_le", "fc32", 1.0 / 2048.0, "12-bit to float"}, + {"fc32", "sc12_item32_le", 2048.0, "Float to 12-bit"}, + }; +} + +struct BenchmarkResult +{ + id_type id; + int priority; + std::string priority_name; + size_t buffer_size; + double ns_per_sample; + double samples_per_sec; + double throughput_gbps; + size_t bytes_per_sample; +}; + +BenchmarkResult run_batch_benchmark(id_type id, + int prio, + size_t buffer_size, + size_t iterations, + double scale_factor = 1.0) +{ + BenchmarkResult result; + result.id = id; + result.priority = prio; + result.priority_name = get_priority_name(prio); + result.buffer_size = buffer_size; + + converter::sptr conv; + try { + conv = get_converter(id, prio)(); + } catch (uhd::key_error&) { + result.ns_per_sample = -1; + result.samples_per_sec = 0; + result.throughput_gbps = 0; + return result; + } + + conv->set_scalar(scale_factor); + + const size_t alignment = 64; + const size_t alloc_size = buffer_size * 16 + alignment; + std::vector input_storage(alloc_size); + std::vector output_storage(alloc_size); + + void* input_ptr = reinterpret_cast( + (reinterpret_cast(input_storage.data()) + alignment - 1) + & ~(alignment - 1)); + void* output_ptr = reinterpret_cast( + (reinterpret_cast(output_storage.data()) + alignment - 1) + & ~(alignment - 1)); + + // Initialize input with deterministic data + uint32_t* input_u32 = static_cast(input_ptr); + for (size_t i = 0; i < buffer_size * 4; i++) 
{ + input_u32[i] = static_cast(i * 12345 + 67890); + } + + std::vector input_buf_refs(1, input_ptr); + std::vector output_buf_refs(1, output_ptr); + + // Warm-up runs + conv->conv(input_buf_refs, output_buf_refs, buffer_size); + conv->conv(input_buf_refs, output_buf_refs, buffer_size); + + // Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (size_t iter = 0; iter < iterations; iter++) { + conv->conv(input_buf_refs, output_buf_refs, buffer_size); + } + auto end = std::chrono::high_resolution_clock::now(); + + double elapsed_ns = + std::chrono::duration_cast(end - start).count(); + double total_samples = static_cast(buffer_size) * iterations; + + result.ns_per_sample = elapsed_ns / total_samples; + result.samples_per_sec = total_samples / (elapsed_ns * 1e-9); + const std::string in_type = format_to_type(id.input_format); + const std::string out_type = format_to_type(id.output_format); + result.bytes_per_sample = + get_bytes_per_item(in_type) + get_bytes_per_item(out_type); + result.throughput_gbps = + (result.samples_per_sec * result.bytes_per_sample * 8) / 1e9; + + return result; +} + +void print_results_table(const std::vector& results, + const std::string& title, + bool show_throughput = false) +{ + std::cout << "\n" << std::string(80, '=') << "\n"; + std::cout << title << "\n"; + std::cout << std::string(80, '=') << "\n"; + + // Group by conversion type + std::map> grouped; + for (const auto& r : results) { + std::string key = r.id.input_format + " -> " + r.id.output_format; + grouped[key].push_back(r); + } + + for (const auto& kv : grouped) { + std::cout << "\n" << kv.first << ":\n"; + std::cout << std::string(70, '-') << "\n"; + + if (show_throughput) { + std::cout << std::setw(12) << "Buffer Size" << std::setw(15) << "Priority" + << std::setw(15) << "ns/sample" << std::setw(15) << "MSamples/s" + << std::setw(12) << "Gbps" << "\n"; + } else { + std::cout << std::setw(12) << "Buffer Size" << std::setw(15) << "Priority" + << std::setw(15) << "ns/sample" << std::setw(15) << "MSamples/s" + << "\n"; + } + std::cout << std::string(70, '-') << "\n"; + + for (const auto& r : kv.second) { + if (r.ns_per_sample < 0) { + std::cout << std::setw(12) << r.buffer_size << std::setw(15) + << r.priority_name << std::setw(15) << "N/A" << std::setw(15) + << "N/A" << "\n"; + } else { + std::cout << std::setw(12) << r.buffer_size << std::setw(15) + << r.priority_name << std::setw(15) << std::fixed + << std::setprecision(3) << r.ns_per_sample << std::setw(15) + << std::setprecision(2) << r.samples_per_sec / 1e6; + if (show_throughput) { + std::cout << std::setw(12) << std::setprecision(2) + << r.throughput_gbps; + } + std::cout << "\n"; + } + } + } +} + +void print_comparison_table(const std::vector& results, + const std::string& conversion, + size_t buffer_size) +{ + std::cout << "\n" << conversion << " @ " << buffer_size << " samples:\n"; + std::cout << std::string(60, '-') << "\n"; + std::cout << std::setw(15) << "Priority" << std::setw(15) << "ns/sample" + << std::setw(15) << "MSamples/s" << std::setw(15) << "Speedup" << "\n"; + std::cout << std::string(60, '-') << "\n"; + + // Find baseline (Generic, prio 0) + double baseline_ns = 0; + for (const auto& r : results) { + if (r.priority == 0 && r.ns_per_sample > 0) { + baseline_ns = r.ns_per_sample; + break; + } + } + + for (const auto& r : results) { + if (r.ns_per_sample < 0) { + std::cout << std::setw(15) << r.priority_name << std::setw(15) << "N/A" + << std::setw(15) << "N/A" << std::setw(15) << "N/A" << "\n"; + } else if 
(baseline_ns <= 0) { + std::cout << std::setw(15) << r.priority_name << std::setw(15) << std::fixed + << std::setprecision(3) << r.ns_per_sample << std::setw(15) + << std::setprecision(2) << r.samples_per_sec / 1e6 << std::setw(15) + << "N/A" << "\n"; + } else { + double speedup = baseline_ns / r.ns_per_sample; + std::cout << std::setw(15) << r.priority_name << std::setw(15) << std::fixed + << std::setprecision(3) << r.ns_per_sample << std::setw(15) + << std::setprecision(2) << r.samples_per_sec / 1e6 << std::setw(14) + << std::setprecision(2) << speedup << "x" << "\n"; + } + } +} + int UHD_SAFE_MAIN(int argc, char* argv[]) { std::string in_format, out_format; @@ -281,12 +518,18 @@ int UHD_SAFE_MAIN(int argc, char* argv[]) ("samples", po::value(&n_samples)->default_value(1000000), "Number of samples per iteration") ("iterations", po::value(&iterations)->default_value(10000), "Number of iterations per benchmark") ("priorities", po::value(&priorities)->default_value("default"), "Converter priorities. Can be 'default', 'all', or a comma-separated list of priorities.") - ("max-prio", po::value(&max_prio)->default_value(4), "Largest available priority (advanced feature)") + ("max-prio", po::value(&max_prio)->default_value(5), "Largest available priority (advanced feature)") ("n-inputs", po::value(&n_inputs)->default_value(1), "Number of input vectors") ("n-outputs", po::value(&n_outputs)->default_value(1), "Number of output vectors") ("debug-converter", "Skip benchmark and print conversion results. Implies iterations==1 and will only run on a single converter.") ("seed-mode", po::value(&seed_mode)->default_value("random"), "How to initialize the data: random, incremental") ("hex", "When using debug mode, dump memory in hex") + ("batch", "Run batch benchmark across all predefined converter configurations") + ("buffer-sizes", po::value()->default_value("64,256,1024,4096,16384,65536,262144"), "Buffer sizes for batch mode (comma-separated)") + ("compare", "Show comparison tables with speedup (batch mode)") + ("throughput", "Show throughput in Gbps (batch mode)") + ("csv", "Output batch results in CSV format") + ("quick", "Quick batch mode: fewer buffer sizes and iterations") ; // clang-format on po::variables_map vm; @@ -309,6 +552,172 @@ int UHD_SAFE_MAIN(int argc, char* argv[]) return EXIT_FAILURE; } + // Batch mode: run predefined converter configs across multiple buffer sizes + if (vm.count("batch")) { + std::vector buffer_sizes; + std::string sizes_str = vm["buffer-sizes"].as(); + if (vm.count("quick")) { + sizes_str = "256,4096,65536"; + } + { + std::stringstream ss(sizes_str); + std::string item; + while (std::getline(ss, item, ',')) { + buffer_sizes.push_back(std::stoull(item)); + } + } + + std::vector batch_priorities; + { + // Use max_prio to determine which priorities to test + for (int i = 0; i < static_cast(max_prio); i++) { + batch_priorities.push_back(i); + } + } + + size_t batch_iterations = iterations; + if (vm.count("quick")) { + batch_iterations = 20; + } + + std::string filter; + bool show_comparison = vm.count("compare") > 0; + bool show_throughput = vm.count("throughput") > 0; + bool csv_output = vm.count("csv") > 0; + auto converter_configs = get_converter_configs(); + + std::cout << "========================================\n"; + std::cout << "UHD Converter Batch Benchmark\n"; + std::cout << "========================================\n"; + std::cout << "Buffer sizes: "; + for (auto s : buffer_sizes) + std::cout << s << " "; + std::cout << "\n"; + std::cout << "Iterations: " 
<< batch_iterations << "\n"; + std::cout << "Priorities: "; + for (auto p : batch_priorities) + std::cout << p << " "; + std::cout << "\n\n"; + + std::vector all_results; + + if (csv_output) { + std::cout << "input_format,output_format,buffer_size,priority," + "priority_name,ns_per_sample,msamples_per_sec,gbps\n"; + } + + // Run benchmarks + for (const auto& config : converter_configs) { + id_type bid; + bid.input_format = config.input_format; + bid.num_inputs = 1; + bid.output_format = config.output_format; + bid.num_outputs = 1; + + std::string conv_name = + config.input_format + " -> " + config.output_format; + + if (!csv_output) { + std::cout << "Benchmarking: " << conv_name << " (" + << config.description << ")...\n"; + } + + std::vector conv_results; + + for (size_t buf_size : buffer_sizes) { + for (int bprio : batch_priorities) { + auto result = run_batch_benchmark( + bid, bprio, buf_size, batch_iterations, config.scale_factor); + + if (csv_output && result.ns_per_sample > 0) { + std::cout << config.input_format << "," + << config.output_format << "," << buf_size + << "," << bprio << "," << result.priority_name + << "," << std::fixed << std::setprecision(3) + << result.ns_per_sample << "," + << std::setprecision(2) + << result.samples_per_sec / 1e6 << "," + << result.throughput_gbps << "\n"; + } + + all_results.push_back(result); + conv_results.push_back(result); + } + } + + if (show_comparison && !csv_output) { + for (size_t buf_size : buffer_sizes) { + std::vector size_results; + for (const auto& r : conv_results) { + if (r.buffer_size == buf_size) { + size_results.push_back(r); + } + } + print_comparison_table(size_results, conv_name, buf_size); + } + } + } + + // summary tables + if (!csv_output) { + print_results_table(all_results, "All Benchmark Results", show_throughput); + + // Print fastest converter for each type at largest buffer size + std::cout << "\n" << std::string(80, '=') << "\n"; + std::cout << "Summary: Fastest Converter for Each Type\n"; + std::cout << std::string(80, '=') << "\n"; + std::cout << std::setw(35) << "Conversion" << std::setw(15) + << "Best Priority" << std::setw(15) << "ns/sample" + << std::setw(15) << "Speedup vs Gen" << "\n"; + std::cout << std::string(80, '-') << "\n"; + + size_t largest_buf = + *std::max_element(buffer_sizes.begin(), buffer_sizes.end()); + + // Group by conversion + std::map> by_conv; + for (const auto& r : all_results) { + if (r.buffer_size == largest_buf && r.ns_per_sample > 0) { + std::string key = + r.id.input_format + " -> " + r.id.output_format; + by_conv[key].push_back(r); + } + } + + for (auto& kv : by_conv) { + auto& results = kv.second; + auto fastest = std::min_element(results.begin(), + results.end(), + [](const BenchmarkResult& a, const BenchmarkResult& b) { + return a.ns_per_sample < b.ns_per_sample; + }); + + double generic_ns = 0; + for (const auto& r : results) { + if (r.priority == 0 && r.ns_per_sample > 0) { + generic_ns = r.ns_per_sample; + break; + } + } + + std::cout << std::setw(35) << kv.first << std::setw(15) + << fastest->priority_name << std::setw(15) << std::fixed + << std::setprecision(3) << fastest->ns_per_sample; + + if (generic_ns > 0) { + double speedup = generic_ns / fastest->ns_per_sample; + std::cout << std::setw(14) << std::setprecision(2) << speedup + << "x"; + } else { + std::cout << std::setw(15) << "N/A"; + } + std::cout << "\n"; + } + } + + return EXIT_SUCCESS; + } + // Parse more arguments if (seed_mode == "incremental") { buf_seed_mode = INC;
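With the batch additions above, the tool can sweep the predefined conversions from get_converter_configs() across several buffer sizes and every priority tier in one run. Assuming the utility keeps its existing binary name (converter_benchmark, built from this file), an invocation such as "converter_benchmark --batch --quick --compare" runs the reduced sweep (buffer sizes 256, 4096, and 65536 at 20 iterations) and prints per-conversion speedup tables against the generic priority-0 baseline, while "--batch --csv" emits the same measurements as machine-readable rows under the header input_format,output_format,buffer_size,priority,priority_name,ns_per_sample,msamples_per_sec,gbps. Note that the reported Gbps figure counts bytes moved on both sides of the conversion: sc16_item32_le to fc32, for example, touches 4 + 8 = 12 bytes per sample, so 100 Msamples/s is reported as 100e6 * 12 * 8 / 1e9 = 9.6 Gbps.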