From c5ffb612a0fe7ccf999fb1a0d67ad3410044da9e Mon Sep 17 00:00:00 2001
From: Joseph Hwang <joseph@motherduck.com>
Date: Sun, 22 Mar 2026 21:51:47 -0700
Subject: [PATCH 1/5] Various changes

---
 CMakeLists.txt                                |  31 ++-
 .../end_to_end/end_to_end_superkmeans.cpp     |   3 +-
 .../distance_computers/batch_computers.h      | 227 +++++++++++------
 include/superkmeans/superkmeans.h             | 229 +++++++++++++++++-
 4 files changed, 402 insertions(+), 88 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e4090b8..3b666f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,7 +24,20 @@ set(SKMEANS_COMPILE_BENCHMARKS OFF CACHE BOOL "Whether to compile benchmarks")
 set(SKMEANS_ENABLE_GPU OFF CACHE BOOL "Whether to use the GPU-based implementation of SuperKMeans")
 set(SKMEANS_COMPILE_EXAMPLES ON CACHE BOOL "Whether to compile examples")
 
-find_package(OpenMP REQUIRED)
+
+# Apple Clang does not bundle OpenMP; point FindOpenMP at Homebrew's libomp.
+if(APPLE)
+    execute_process(COMMAND brew --prefix libomp
+        OUTPUT_VARIABLE HOMEBREW_LIBOMP_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
+    if(HOMEBREW_LIBOMP_PREFIX)
+        set(OpenMP_CXX_FLAGS "-Xclang -fopenmp -I${HOMEBREW_LIBOMP_PREFIX}/include"
+            CACHE STRING "" FORCE)
+        set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE)
+        set(OpenMP_omp_LIBRARY "${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib"
+            CACHE FILEPATH "" FORCE)
+    endif()
+endif()
+find_package(OpenMP REQUIRED COMPONENTS CXX)
 
 list(PREPEND CMAKE_PREFIX_PATH /usr/local)
 
@@ -69,7 +82,19 @@ else()
     message(STATUS "GPU disabled")
 endif()
 
-add_compile_definitions(CMAKE_SOURCE_DIR="${CMAKE_SOURCE_DIR}")
+set(XNNPACK_BUILD_TESTS OFF CACHE BOOL "" FORCE)
+set(XNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE)
+set(XNNPACK_BUILD_ALL_MICROKERNELS OFF CACHE BOOL "" FORCE)
+FetchContent_Declare(
+    xnnpack
+    GIT_REPOSITORY https://github.com/google/XNNPACK.git
+    GIT_TAG master
+    GIT_SHALLOW TRUE
+)
+FetchContent_MakeAvailable(xnnpack)
+set(XNNPACK_LINK_LIBRARIES XNNPACK pthreadpool)
+
+add_compile_definitions(CMAKE_SOURCE_DIR="${CMAKE_SOURCE_DIR}" BENCHMARK_TIME)
 include_directories(include extern/Eigen)
 
 if(SKMEANS_COMPILE_TESTS)
@@ -120,6 +145,8 @@ if(Python_FOUND AND pybind11_FOUND)
         target_link_libraries(_superkmeans PRIVATE ${BLAS_LINK_LIBRARIES})
     endif()
 
+    target_link_libraries(_superkmeans PRIVATE ${XNNPACK_LINK_LIBRARIES})
+
     if(FFTW_FLOAT_LIB_FOUND)
         target_link_libraries(_superkmeans PRIVATE ${FFTW_FLOAT_LIB} ${FFTW_FLOAT_OPENMP_LIB})
     endif()
diff --git a/benchmarks/end_to_end/end_to_end_superkmeans.cpp b/benchmarks/end_to_end/end_to_end_superkmeans.cpp
index fc0b176..9b50099 100644
--- a/benchmarks/end_to_end/end_to_end_superkmeans.cpp
+++ b/benchmarks/end_to_end/end_to_end_superkmeans.cpp
@@ -67,13 +67,14 @@ int main(int argc, char* argv[]) {
 
     skmeans::SuperKMeansConfig config;
     config.iters = n_iters;
-    config.verbose = false;
+    config.verbose = true;
     config.n_threads = THREADS;
     config.objective_k = 100;
     config.ann_explore_fraction = 0.01f;
     config.unrotate_centroids = true;
     config.early_termination = false;
     config.sampling_fraction = sampling_fraction;
+    // TODO: For joseph experiment
     config.use_blas_only = false;
 
     auto is_angular = std::find(
diff --git a/include/superkmeans/distance_computers/batch_computers.h b/include/superkmeans/distance_computers/batch_computers.h
index 12ba03d..2d96ab0 100644
--- a/include/superkmeans/distance_computers/batch_computers.h
+++ b/include/superkmeans/distance_computers/batch_computers.h
@@ -2,34 +2,17 @@
 
 #include <cstdint>
 #include <cstdio>
+#include <limits>
+#include <vector>
 #include <omp.h>
 
 #include "superkmeans/common.h"
 #include "superkmeans/distance_computers/base_computers.h"
+#include "superkmeans/distance_computers/matmul.h"
 #include "superkmeans/pdx/layout.h"
 #include "superkmeans/profiler.h"
 #include <Eigen/Dense>
 
-// Eigen already declares sgemm_, so we don't need to redeclare it
-// TODO(lkuffo, low): However, I would like to have more control over this
-extern "C" {
-/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
-// int sgemm_(
-//         const char* transa,
-//         const char* transb,
-//         FINTEGER* m,
-//         FINTEGER* n,
-//         FINTEGER* k,
-//         const float* alpha,
-//         const float* a,
-//         FINTEGER* lda,
-//         const float* b,
-//         FINTEGER* ldb,
-//         float* beta,
-//         float* c,
-//         FINTEGER* ldc);
-}
-
 namespace skmeans {
 
 template <DistanceFunction alpha, Quantization q>
@@ -38,6 +21,145 @@ class BatchComputer {};
 template <>
 class BatchComputer<DistanceFunction::l2, Quantization::u8> {};
 
+/**
+ * @brief Finds the top-1 nearest neighbor for each query vector (int8 data).
+ *
+ * Computes L2 distances between X and Y using int8 dot products via XNNPACK:
+ *   ||x-y||² = ||x||² + ||y||² - 2*x·y
+ *
+ * @param x Query vectors in row-major layout (n_x × d), int8
+ * @param y Reference vectors in row-major layout (n_y × d), int8
+ * @param n_x Number of query vectors
+ * @param n_y Number of reference vectors
+ * @param d Dimensionality
+ * @param norms_x Pre-computed squared L2 norms of query vectors (float)
+ * @param norms_y Pre-computed squared L2 norms of reference vectors (float)
+ * @param out_knn Output: index of nearest neighbor for each query
+ * @param out_distances Output: distance to nearest neighbor for each query
+ * @param tmp_distances_buf Buffer for batch distance computation (size: X_BATCH_SIZE × Y_BATCH_SIZE)
+ */
+/**
+ * @brief Finds the top-k nearest neighbors for each query vector (int8 data).
+ *
+ * Computes L2 distances between X and Y using int8 dot products via XNNPACK:
+ *   ||x-y||² = ||x||² + ||y||² - 2*x·y
+ * For k=1, uses vectorized minCoeff. For k>1, maintains a sorted top-k array
+ * per query with threshold-based rejection.
+ *
+ * @param x Query vectors in row-major layout (n_x × d), int8
+ * @param y Reference vectors in row-major layout (n_y × d), int8
+ * @param n_x Number of query vectors
+ * @param n_y Number of reference vectors
+ * @param d Dimensionality
+ * @param norms_x Pre-computed squared L2 norms of query vectors (float)
+ * @param norms_y Pre-computed squared L2 norms of reference vectors (float)
+ * @param k Number of nearest neighbors to find
+ * @param out_knn Output: indices of k nearest neighbors per query (size: n_x × k)
+ * @param out_distances Output: distances to k nearest neighbors per query (size: n_x × k)
+ * @param tmp_distances_buf Buffer for batch distance computation (size: X_BATCH_SIZE × Y_BATCH_SIZE)
+ */
+inline void FindKNearestNeighborsI8(
+    const int8_t* SKM_RESTRICT x,
+    const int8_t* SKM_RESTRICT y,
+    const size_t n_x,
+    const size_t n_y,
+    const size_t d,
+    const float* SKM_RESTRICT norms_x,
+    const float* SKM_RESTRICT norms_y,
+    const size_t k,
+    uint32_t* SKM_RESTRICT out_knn,
+    float* SKM_RESTRICT out_distances,
+    float* SKM_RESTRICT tmp_distances_buf
+) {
+    using MatrixR = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    matmul::InitXnnpack();
+    SKM_PROFILE_SCOPE("blas_i8_knn");
+    std::fill_n(out_distances, n_x * k, std::numeric_limits<float>::max());
+    std::fill_n(out_knn, n_x * k, static_cast<uint32_t>(-1));
+
+    for (size_t i = 0; i < n_x; i += X_BATCH_SIZE) {
+        auto batch_n_x = X_BATCH_SIZE;
+        auto batch_x_p = x + (i * d);
+        if (i + X_BATCH_SIZE > n_x) {
+            batch_n_x = n_x - i;
+        }
+        for (size_t j = 0; j < n_y; j += Y_BATCH_SIZE) {
+            auto batch_n_y = Y_BATCH_SIZE;
+            auto batch_y_p = y + (j * d);
+            if (j + Y_BATCH_SIZE > n_y) {
+                batch_n_y = n_y - j;
+            }
+            {
+                SKM_PROFILE_SCOPE("blas_i8_knn/gemm");
+                matmul::XnnpackMatmulI8F32(
+                    batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, 0, tmp_distances_buf
+                );
+            }
+            Eigen::Map<MatrixR> distances_matrix(tmp_distances_buf, batch_n_x, batch_n_y);
+
+            SKM_PROFILE_SCOPE("blas_i8_knn/top_k");
+#pragma omp parallel for num_threads(g_n_threads)
+            for (size_t r = 0; r < batch_n_x; ++r) {
+                const auto i_idx = i + r;
+                const float norm_x_i = norms_x[i_idx];
+                float* row_p = distances_matrix.data() + r * batch_n_y;
+#pragma clang loop vectorize(enable)
+                for (size_t c = 0; c < batch_n_y; ++c) {
+                    row_p[c] = -2.0f * row_p[c] + norm_x_i + norms_y[j + c];
+                }
+
+                float* out_dist_row = out_distances + i_idx * k;
+                uint32_t* out_knn_row = out_knn + i_idx * k;
+
+                if (k == 1) {
+                    uint32_t min_idx;
+                    auto min_dist = distances_matrix.row(r).minCoeff(&min_idx);
+                    if (min_dist < out_dist_row[0]) {
+                        out_dist_row[0] = std::max(0.0f, min_dist);
+                        out_knn_row[0] = static_cast<uint32_t>(j + min_idx);
+                    }
+                    continue;
+                }
+
+                float threshold = out_dist_row[k - 1];
+                for (size_t c = 0; c < batch_n_y; ++c) {
+                    float dist = std::max(0.0f, row_p[c]);
+                    if (dist >= threshold) continue;
+
+                    // Find where the new candidate belongs
+                    size_t pos = 0;
+                    for (; pos < k; ++pos) {
+                        if (dist < out_dist_row[pos]) break;
+                    }
+                    // Shift worse entries down by one (evicts the k-th)
+                    for (size_t ki = k - 1; ki > pos; --ki) {
+                        out_dist_row[ki] = out_dist_row[ki - 1];
+                        out_knn_row[ki] = out_knn_row[ki - 1];
+                    }
+                    out_dist_row[pos] = dist;
+                    out_knn_row[pos] = static_cast<uint32_t>(j + c);
+                    threshold = out_dist_row[k - 1];
+                }
+            }
+        }
+    }
+}
+
+inline void FindNearestNeighborI8(
+    const int8_t* SKM_RESTRICT x,
+    const int8_t* SKM_RESTRICT y,
+    const size_t n_x,
+    const size_t n_y,
+    const size_t d,
+    const float* SKM_RESTRICT norms_x,
+    const float* SKM_RESTRICT norms_y,
+    uint32_t* SKM_RESTRICT out_knn,
+    float* SKM_RESTRICT out_distances,
+    float* SKM_RESTRICT tmp_distances_buf
+) {
+    FindKNearestNeighborsI8(x, y, n_x, n_y, d, norms_x, norms_y, 1, out_knn, out_distances, tmp_distances_buf);
+}
+
 template <>
 class BatchComputer<DistanceFunction::l2, Quantization::f32> {
 
@@ -49,59 +171,6 @@ class BatchComputer<DistanceFunction::l2, Quantization::f32> {
     using MatrixR = Eigen::Matrix<distance_t, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
     using MatrixC = Eigen::Matrix<distance_t, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
 
-  private:
-    /**
-     * @brief Performs BLAS matrix multiplication: distances = x * y^T
-     *
-     * Computes the dot product matrix between X and Y
-     * Can optionally use only the first partial_d dimensions for partial distance computation.
-     *
-     * @param batch_x_p Pointer to query vectors batch (batch_n_x × d)
-     * @param batch_y_p Pointer to reference vectors batch (batch_n_y × d)
-     * @param batch_n_x Number of query vectors in batch
-     * @param batch_n_y Number of reference vectors in batch
-     * @param d Full dimensionality
-     * @param partial_d Number of dimensions to use (if 0, uses all dimensions)
-     * @param tmp_distances_buf Output buffer for distance matrix (batch_n_x × batch_n_y)
-     */
-    static void BlasMatrixMultiplication(
-        const data_t* SKM_RESTRICT batch_x_p,
-        const data_t* SKM_RESTRICT batch_y_p,
-        const size_t batch_n_x,
-        const size_t batch_n_y,
-        const size_t d,
-        const size_t partial_d,
-        float* SKM_RESTRICT tmp_distances_buf
-    ) {
-        const char trans_a = 'T';
-        const char trans_b = 'N';
-
-        int m = static_cast<int>(batch_n_y);
-        int n = static_cast<int>(batch_n_x);
-        int k = static_cast<int>(partial_d > 0 && partial_d < d ? partial_d : d);
-        float alpha = 1.0f;
-        float beta = 0.0f;
-        int lda = static_cast<int>(d);         // d of y (row stride in row-major)
-        int ldb = static_cast<int>(d);         // d of x (row stride in row-major)
-        int ldc = static_cast<int>(batch_n_y); // Leading dimension of tmp_distances_buf
-
-        sgemm_(
-            &trans_a,
-            &trans_b,
-            &m,
-            &n,
-            &k,
-            &alpha,
-            batch_y_p,
-            &lda,
-            batch_x_p,
-            &ldb,
-            &beta,
-            tmp_distances_buf,
-            &ldc
-        );
-    }
-
   public:
     /**
      * @brief Finds the top-1 nearest neighbor for each query vector.
@@ -133,8 +202,6 @@ class BatchComputer<DistanceFunction::l2, Quantization::f32> {
         distance_t* SKM_RESTRICT out_distances,
         float* SKM_RESTRICT tmp_distances_buf
     ) {
-        SKM_PROFILE_SCOPE("search");
-        SKM_PROFILE_SCOPE("search/1st_blas");
         std::fill_n(out_distances, n_x, std::numeric_limits<distance_t>::max());
         for (size_t i = 0; i < n_x; i += X_BATCH_SIZE) {
             auto batch_n_x = X_BATCH_SIZE;
@@ -154,7 +221,7 @@ class BatchComputer<DistanceFunction::l2, Quantization::f32> {
 #pragma omp parallel for num_threads(g_n_threads) schedule(static)
                 for (size_t r = 0; r < batch_n_x; r += MINI_BATCH_SIZE) {
                     auto mini_batch_n_x = std::min(MINI_BATCH_SIZE, batch_n_x - r);
-                    BlasMatrixMultiplication(
+                    matmul::MatrixMultiplication(
                         batch_x_p + r * d,
                         batch_y_p,
                         mini_batch_n_x,
@@ -165,7 +232,7 @@ class BatchComputer<DistanceFunction::l2, Quantization::f32> {
                     );
                 }
 #else
-                BlasMatrixMultiplication(
+                matmul::MatrixMultiplication(
                     batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, 0, tmp_distances_buf
                 );
 #endif
@@ -245,7 +312,7 @@ class BatchComputer<DistanceFunction::l2, Quantization::f32> {
                     batch_n_y = n_y - j;
                 }
 
-                BlasMatrixMultiplication(
+                matmul::MatrixMultiplication(
                     batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, 0, tmp_distances_buf
                 );
                 Eigen::Map<MatrixR> distances_matrix(tmp_distances_buf, batch_n_x, batch_n_y);
@@ -351,7 +418,7 @@ class BatchComputer<DistanceFunction::l2, Quantization::f32> {
 #pragma omp parallel for num_threads(g_n_threads) schedule(static)
                     for (size_t r = 0; r < batch_n_x; r += MINI_BATCH_SIZE) {
                         auto mini_batch_n_x = std::min(MINI_BATCH_SIZE, batch_n_x - r);
-                        BlasMatrixMultiplication(
+                        matmul::MatrixMultiplication(
                             batch_x_p + r * d,
                             batch_y_p,
                             mini_batch_n_x,
@@ -362,7 +429,7 @@ class BatchComputer<DistanceFunction::l2, Quantization::f32> {
                         );
                     }
 #else
-                    BlasMatrixMultiplication(
+                    matmul::MatrixMultiplication(
                         batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, partial_d, tmp_distances_buf
                     );
 #endif
diff --git a/include/superkmeans/superkmeans.h b/include/superkmeans/superkmeans.h
index f0cc253..b0e7faf 100644
--- a/include/superkmeans/superkmeans.h
+++ b/include/superkmeans/superkmeans.h
@@ -661,6 +661,104 @@ class SuperKMeans {
         }
     }
 
+    /**
+     * @brief Performs assignment using quantized (int8) distance computation.
+     *
+     * Quantizes data (once) and centroids, finds top-k candidates via i8 GEMM,
+     * then reranks with f32 distances when k > 1. Writes final assignments to
+     * assignments/distances members and zeros accumulators for UpdateCentroids.
+     *
+     * @param data Data matrix (row-major, n_samples × d), used for reranking
+     * @param centroids f32 centroids (row-major, n_clusters × d), used for quantization and reranking
+     * @param n_samples Number of vectors in the data
+     * @param n_clusters Number of centroids
+     */
+    void QuantizedAssignAndUpdateCentroids(
+        const vector_value_t* SKM_RESTRICT data,
+        const vector_value_t* SKM_RESTRICT centroids,
+        const size_t n_samples,
+        const size_t n_clusters,
+        uint32_t* SKM_RESTRICT out_assignments,
+        distance_t* SKM_RESTRICT out_distances
+    ) {
+        if (quantized_data.empty()) {
+            SKM_PROFILE_SCOPE("quantize_data");
+            quantized_data.resize(n_samples * d);
+            quantized_data_norms.reset(new vector_value_t[n_samples]);
+            QuantizeEmbeddings(data, n_samples, d, quantized_data.data());
+            GetQuantizedL2NormsRowMajor(quantized_data.data(), n_samples, quantized_data_norms.get());
+        }
+
+        std::vector<int8_t> quantized_centroids_buf;
+        std::unique_ptr<vector_value_t[]> quantized_centroid_norms;
+        {
+            SKM_PROFILE_SCOPE("quantize_centroids");
+            quantized_centroids_buf.resize(n_clusters * d);
+            quantized_centroid_norms.reset(new vector_value_t[n_clusters]);
+            QuantizeEmbeddings(centroids, n_clusters, d, quantized_centroids_buf.data());
+            GetQuantizedL2NormsRowMajor(quantized_centroids_buf.data(), n_clusters, quantized_centroid_norms.get());
+        }
+
+        const size_t i8_k = 2;
+        std::vector<uint32_t> i8_knn(n_samples * i8_k);
+        std::vector<float> i8_knn_distances(n_samples * i8_k);
+        std::vector<float> i8_tmp_distances_buf(X_BATCH_SIZE * Y_BATCH_SIZE);
+        {
+            SKM_PROFILE_SCOPE("assign_i8/knn");
+            FindKNearestNeighborsI8(
+                quantized_data.data(),
+                quantized_centroids_buf.data(),
+                n_samples,
+                n_clusters,
+                d,
+                quantized_data_norms.get(),
+                quantized_centroid_norms.get(),
+                i8_k,
+                i8_knn.data(),
+                i8_knn_distances.data(),
+                i8_tmp_distances_buf.data()
+            );
+        }
+
+        if (i8_k > 1) {
+            SKM_PROFILE_SCOPE("assign_i8/rerank");
+#pragma omp parallel for num_threads(g_n_threads)
+            for (size_t s = 0; s < n_samples; ++s) {
+                const float* __restrict__ point = data + s * d;
+                float best_dist = std::numeric_limits<float>::max();
+                uint32_t best_idx = 0;
+                for (size_t ki = 0; ki < i8_k; ++ki) {
+                    uint32_t c_idx = i8_knn[s * i8_k + ki];
+                    if (c_idx == static_cast<uint32_t>(-1)) break;
+                    const float* __restrict__ cent = centroids + c_idx * d;
+                    float dist = 0;
+#pragma omp simd reduction(+:dist)
+                    for (size_t dim = 0; dim < d; ++dim) {
+                        float diff = point[dim] - cent[dim];
+                        dist += diff * diff;
+                    }
+                    if (dist < best_dist) {
+                        best_dist = dist;
+                        best_idx = c_idx;
+                    }
+                }
+                out_assignments[s] = best_idx;
+                out_distances[s] = best_dist;
+            }
+        } else {
+            for (size_t s = 0; s < n_samples; ++s) {
+                out_assignments[s] = i8_knn[s];
+                out_distances[s] = i8_knn_distances[s];
+            }
+        }
+
+        {
+            SKM_PROFILE_SCOPE("fill");
+            std::fill(horizontal_centroids.get(), horizontal_centroids.get() + (n_clusters * d), 0.0);
+            std::fill(cluster_sizes.get(), cluster_sizes.get() + n_clusters, 0);
+        }
+    }
+
     /**
      * @brief Performs assignment and centroid update using GEMM+PRUNING.
      *
@@ -756,6 +854,67 @@ class SuperKMeans {
         }
     }
 
+    void GetQuantizedL2NormsRowMajor(
+        const int8_t* SKM_RESTRICT data,
+        const size_t n,
+        vector_value_t* SKM_RESTRICT out_norm
+    ) {
+        SKM_PROFILE_SCOPE("norms_calc");
+        for (size_t i = 0; i < n; ++i) {
+            const int8_t* row = data + i * d;
+            int32_t acc = 0;
+    #pragma clang loop vectorize(enable)
+            for (size_t j = 0; j < d; ++j) {
+                acc += static_cast<int32_t>(row[j]) * static_cast<int32_t>(row[j]);
+            }
+            out_norm[i] = static_cast<vector_value_t>(acc);
+        }
+    }
+
+    static void QuantizeEmbedding(
+        const float* embedding,
+        const size_t num_dimensions,
+        const float quantization_base,
+        const float quantization_scale,
+        int8_t* output_quantized_embedding
+    ) {
+#pragma clang loop vectorize(enable)
+        for (size_t i = 0; i < num_dimensions; ++i) {
+            float scaled = (embedding[i] - quantization_base) * quantization_scale;
+            uint8_t u = static_cast<uint8_t>(std::min(nearbyintf(scaled), 255.0f));
+            output_quantized_embedding[i] = static_cast<int8_t>(u - 128);
+        }
+    }
+
+    static void QuantizeEmbeddings(
+        const float* embeddings,
+        const size_t total_elements,
+        const size_t num_dimensions,
+        int8_t* output_quantized_embeddings
+    ) {
+        const size_t total_values = total_elements * num_dimensions;
+        float global_min = std::numeric_limits<float>::max();
+        float global_max = std::numeric_limits<float>::lowest();
+#pragma omp parallel for num_threads(g_n_threads) reduction(min:global_min) reduction(max:global_max)
+        for (size_t i = 0; i < total_values; ++i) {
+            global_min = std::min(global_min, embeddings[i]);
+            global_max = std::max(global_max, embeddings[i]);
+        }
+        const float range = global_max - global_min;
+        const float scaling_factor = (range > 0) ? 255.0f / range : 1.0f;
+
+#pragma omp parallel for num_threads(g_n_threads) schedule(static)
+        for (size_t i = 0; i < total_elements; ++i) {
+            QuantizeEmbedding(
+                &embeddings[i * num_dimensions],
+                num_dimensions,
+                global_min,
+                scaling_factor,
+                &output_quantized_embeddings[i * num_dimensions]
+            );
+        }
+    }
+
     /**
      * @brief Runs a single K-Means iteration with either GEMM-only or GEMM+PRUNING strategy.
      *
@@ -788,17 +947,73 @@ class SuperKMeans {
         const size_t n_clusters,
         size_t& iter_idx,
         const bool is_first_iter,
-        std::vector<SuperKMeansIterationStats>& target_stats
+        std::vector<SuperKMeansIterationStats>& target_stats,
+        const bool run_quantized_assignment = false,
+        const bool run_f32_assignment = true,
+        const bool use_quantized_assignment_for_update = false
     ) {
+        std::cout << "Training centroids iteration=" << iter_idx << std::endl;
+
         if (!is_first_iter) {
             std::swap(horizontal_centroids, prev_centroids);
         }
 
         if constexpr (GEMM_ONLY) {
-            GetL2NormsRowMajor(prev_centroids.get(), n_clusters, centroid_norms.get());
-            FirstAssignAndUpdateCentroids(
-                data_to_cluster, prev_centroids.get(), tmp_distances_buf, n_samples, n_clusters
-            );
+            std::cout << "Running GEMM loop" << std::endl;
+
+            // f32 assignment path
+            if (run_f32_assignment) {
+                SKM_PROFILE_SCOPE("assign_f32");
+                GetL2NormsRowMajor(prev_centroids.get(), n_clusters, centroid_norms.get());
+                FirstAssignAndUpdateCentroids(
+                    data_to_cluster, prev_centroids.get(), tmp_distances_buf, n_samples, n_clusters
+                );
+            }
+
+            // Quantized (i8) assignment path
+            std::vector<uint32_t> i8_assignments;
+            std::vector<float> i8_distances;
+            if (run_quantized_assignment) {
+                i8_assignments.resize(n_samples);
+                i8_distances.assign(n_samples, std::numeric_limits<float>::max());
+                SKM_PROFILE_SCOPE("assign_i8");
+                QuantizedAssignAndUpdateCentroids(
+                    data_to_cluster, prev_centroids.get(), n_samples, n_clusters,
+                    i8_assignments.data(), i8_distances.data()
+                );
+            }
+
+            // Compare i8 vs f32 assignments when both were computed
+            if (run_quantized_assignment && run_f32_assignment) {
+                SKM_PROFILE_SCOPE("compare_assignments");
+                size_t match = 0;
+                size_t mismatches_printed = 0;
+                for (size_t s = 0; s < n_samples; ++s) {
+                    if (i8_assignments[s] == assignments[s]) {
+                        ++match;
+                        continue;
+                    }
+                    if (mismatches_printed < 10) {
+                        std::cout << "  mismatch[" << mismatches_printed << "] sample=" << s
+                                  << " i8_assign=" << i8_assignments[s]
+                                  << " f32_assign=" << assignments[s]
+                                  << " i8_dist=" << i8_distances[s]
+                                  << " f32_dist=" << distances[s]
+                                  << " delta=" << (i8_distances[s] - distances[s]) << std::endl;
+                        ++mismatches_printed;
+                    }
+                }
+                std::cout << "i8 vs f32 assignment agreement: " << match << " / " << n_samples
+                          << " (" << (100.0 * match / n_samples) << "%)" << std::endl;
+            }
+
+            // Choose which assignments drive centroid updates
+            if (use_quantized_assignment_for_update) {
+                assert(!i8_assignments.empty() && i8_assignments.size() == n_samples);
+                std::copy(i8_assignments.begin(), i8_assignments.end(), assignments.get());
+                std::copy(i8_distances.begin(), i8_distances.end(), distances.get());
+            }
+
         } else {
             GetPartialL2NormsRowMajor(
                 prev_centroids.get(), n_clusters, centroids_partial_norms.data(), partial_d
@@ -1471,6 +1686,10 @@ class SuperKMeans {
     std::unique_ptr<vector_value_t[]> centroid_norms;
     std::unique_ptr<size_t[]> sampled_indices;
 
+    // Quantized data (computed once, reused across iterations)
+    std::vector<int8_t> quantized_data;
+    std::unique_ptr<vector_value_t[]> quantized_data_norms;
+
     // Buffers for ground truth and recall computation
     std::unique_ptr<uint32_t[]> gt_assignments;
     std::unique_ptr<distance_t[]> gt_distances;

From 6448e3c46a803846a90e528e79eb8592f65dfd8c Mon Sep 17 00:00:00 2001
From: Joseph Hwang <joseph@motherduck.com>
Date: Sun, 22 Mar 2026 22:40:30 -0700
Subject: [PATCH 2/5] Matmul

---
 .../superkmeans/distance_computers/matmul.h   | 314 ++++++++++++++++++
 1 file changed, 314 insertions(+)
 create mode 100644 include/superkmeans/distance_computers/matmul.h

diff --git a/include/superkmeans/distance_computers/matmul.h b/include/superkmeans/distance_computers/matmul.h
new file mode 100644
index 0000000..4aa0bdc
--- /dev/null
+++ b/include/superkmeans/distance_computers/matmul.h
@@ -0,0 +1,314 @@
+#pragma once
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <utility>
+#include <vector>
+#include <omp.h>
+
+#include "superkmeans/common.h"
+#include "superkmeans/profiler.h"
+
+#include <Eigen/Dense>
+#include <pthreadpool.h>
+#include <xnnpack.h>
+
+#if defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
+// Eigen already declares sgemm_, so we don't need to redeclare it
+extern "C" {}
+
+namespace skmeans {
+namespace matmul {
+
+inline bool UseXnnpack() {
+    static const bool enabled = (std::getenv("USE_XNNPACK") != nullptr);
+    return enabled;
+}
+
+inline bool InitXnnpack() {
+    static const bool ok = (xnn_initialize(nullptr) == xnn_status_success);
+    return ok;
+}
+
+/**
+ * @brief Packs kernel rows into a contiguous partial-dimension buffer for XNNPACK.
+ *
+ * When partial_d < d, XNNPACK requires the kernel to be contiguous N × partial_d.
+ * This copies the first partial_d elements from each of the n_rows rows (stride d)
+ * into a thread-local contiguous buffer and returns a pointer to it.
+ * When partial_d == d, returns the original pointer with no copy.
+ *
+ * @return Pointer to contiguous kernel data and the effective dimension count.
+ */
+template <typename T>
+inline std::pair<const T*, size_t> PackKernelForPartialD(
+    const T* kernel, size_t n_rows, size_t d, size_t partial_d
+) {
+    const size_t partial_d_ = (partial_d > 0 && partial_d < d) ? partial_d : d;
+    if (partial_d_ == d) {
+        return {kernel, d};
+    }
+    thread_local std::vector<char> buf;
+    buf.resize(n_rows * partial_d_ * sizeof(T));
+    auto* dst = reinterpret_cast<T*>(buf.data());
+    for (size_t i = 0; i < n_rows; ++i) {
+        std::memcpy(dst + i * partial_d_, kernel + i * d, partial_d_ * sizeof(T));
+    }
+    return {dst, partial_d_};
+}
+
+/**
+ * @brief Performs matrix multiplication using XNNPACK's fully-connected operator: C = X * Y^T
+ *
+ * Uses XNNPACK's NC F32 fully-connected operator where X is the "input" (batch_n_x × d)
+ * and Y is the "kernel" (batch_n_y × d). Supports partial-dimension computation via partial_d:
+ * the input (X) uses XNNPACK's input_stride to skip trailing dimensions without copying,
+ * while the kernel (Y) rows are packed into a contiguous buffer when partial_d < d.
+ *
+ * Thread-local pthreadpool with g_n_threads threads is used. Not safe to call from OMP parallel regions.
+ */
+inline void XnnpackMatmulF32(
+    const float* SKM_RESTRICT batch_x_p,
+    const float* SKM_RESTRICT batch_y_p,
+    const size_t batch_n_x,
+    const size_t batch_n_y,
+    const size_t d,
+    const size_t partial_d,
+    float* SKM_RESTRICT tmp_distances_buf
+) {
+    SKM_PROFILE_SCOPE("xnnpack_f32");
+    const float* kernel_p;
+    size_t partial_d_;
+    {
+        SKM_PROFILE_SCOPE("xnnpack_f32/pack");
+        std::tie(kernel_p, partial_d_) = PackKernelForPartialD(batch_y_p, batch_n_y, d, partial_d);
+    }
+
+    {
+        SKM_PROFILE_SCOPE("xnnpack_f32/matmul");
+        thread_local pthreadpool_t tp = pthreadpool_create(omp_get_max_threads());
+        xnn_operator_t op = nullptr;
+        xnn_create_fully_connected_nc_f32(
+            partial_d_, batch_n_y, d, batch_n_y,
+            kernel_p, nullptr,
+            -std::numeric_limits<float>::infinity(),
+            std::numeric_limits<float>::infinity(),
+            0, nullptr, &op
+        );
+        xnn_reshape_fully_connected_nc_f32(op, batch_n_x, tp);
+        xnn_setup_fully_connected_nc_f32(op, batch_x_p, tmp_distances_buf);
+        xnn_run_operator(op, tp);
+        xnn_delete_operator(op);
+    }
+}
+
+/**
+ * @brief Performs int8→float32 matrix multiplication using XNNPACK: C = X * Y^T
+ *
+ * Uses XNNPACK's dynamic-quantized int8 fully-connected operator (qd8_f32_qc8w).
+ * Identity quantization (scale=1.0, zero_point=0) so the result is the integer dot product
+ * cast to float. Supports partial-dimension computation via partial_d.
+ *
+ * Thread-local pthreadpool with g_n_threads threads is used. Not safe to call from OMP parallel regions.
+ */
+inline void XnnpackMatmulI8F32(
+    const int8_t* SKM_RESTRICT batch_x_p,
+    const int8_t* SKM_RESTRICT batch_y_p,
+    const size_t batch_n_x,
+    const size_t batch_n_y,
+    const size_t d,
+    const size_t partial_d,
+    float* SKM_RESTRICT tmp_distances_buf
+) {
+    SKM_PROFILE_SCOPE("xnnpack_i8f32");
+    const int8_t* kernel_p;
+    size_t partial_d_;
+    {
+        SKM_PROFILE_SCOPE("xnnpack_i8f32/pack");
+        std::tie(kernel_p, partial_d_) = PackKernelForPartialD(batch_y_p, batch_n_y, d, partial_d);
+    }
+
+    {
+        SKM_PROFILE_SCOPE("xnnpack_i8f32/matmul");
+        thread_local std::vector<float> kernel_scale;
+        kernel_scale.assign(batch_n_y, 1.0f);
+
+        thread_local std::vector<xnn_quantization_params> qparams;
+        qparams.assign(batch_n_x, {0, 1.0f});
+
+        thread_local pthreadpool_t tp_i8 = pthreadpool_create(omp_get_max_threads());
+        xnn_operator_t op = nullptr;
+        xnn_create_fully_connected_nc_qd8_f32_qc8w(
+            partial_d_, batch_n_y, d, batch_n_y,
+            kernel_scale.data(), kernel_p, nullptr,
+            -std::numeric_limits<float>::infinity(),
+            std::numeric_limits<float>::infinity(),
+            0, nullptr, &op
+        );
+        size_t ws_size = 0;
+        xnn_reshape_fully_connected_nc_qd8_f32_qc8w(op, batch_n_x, &ws_size, tp_i8);
+        thread_local std::vector<char> ws;
+        ws.resize(ws_size);
+        xnn_setup_fully_connected_nc_qd8_f32_qc8w(
+            op, batch_x_p, tmp_distances_buf, ws.data(), qparams.data()
+        );
+        xnn_run_operator(op, tp_i8);
+        xnn_delete_operator(op);
+    }
+}
+
+#if defined(__aarch64__)
+/**
+ * @brief Performs int8 matrix multiplication using NEON SDOT: C = X * Y^T
+ *
+ * Hand-written kernel using vdotq_s32 (SDOT) instructions for int8 dot products.
+ * Both X and Y are row-major with stride d; only the first partial_d dimensions are used.
+ * Output is int32 accumulation of the dot products.
+ */
+inline void NeonSdotMatmulI8(
+    const int8_t* SKM_RESTRICT batch_x_p,
+    const int8_t* SKM_RESTRICT batch_y_p,
+    const size_t batch_n_x,
+    const size_t batch_n_y,
+    const size_t d,
+    const size_t partial_d,
+    int32_t* SKM_RESTRICT tmp_distances_buf
+) {
+    const size_t partial_d_ = (partial_d > 0 && partial_d < d) ? partial_d : d;
+    const size_t K16 = partial_d_ & ~(size_t)15;
+
+#pragma omp parallel for schedule(static) num_threads(g_n_threads)
+    for (size_t m = 0; m < batch_n_x; m++) {
+        const int8_t* a_row = batch_x_p + m * d;
+        int32_t* c_row = tmp_distances_buf + m * batch_n_y;
+
+        size_t n = 0;
+        for (; n + 4 <= batch_n_y; n += 4) {
+            const int8_t* b0 = batch_y_p + (n + 0) * d;
+            const int8_t* b1 = batch_y_p + (n + 1) * d;
+            const int8_t* b2 = batch_y_p + (n + 2) * d;
+            const int8_t* b3 = batch_y_p + (n + 3) * d;
+
+            int32x4_t acc0 = vdupq_n_s32(0);
+            int32x4_t acc1 = vdupq_n_s32(0);
+            int32x4_t acc2 = vdupq_n_s32(0);
+            int32x4_t acc3 = vdupq_n_s32(0);
+
+            for (size_t k = 0; k < K16; k += 16) {
+                int8x16_t va = vld1q_s8(a_row + k);
+                acc0 = vdotq_s32(acc0, va, vld1q_s8(b0 + k));
+                acc1 = vdotq_s32(acc1, va, vld1q_s8(b1 + k));
+                acc2 = vdotq_s32(acc2, va, vld1q_s8(b2 + k));
+                acc3 = vdotq_s32(acc3, va, vld1q_s8(b3 + k));
+            }
+
+            int32_t s0 = vaddvq_s32(acc0), s1 = vaddvq_s32(acc1);
+            int32_t s2 = vaddvq_s32(acc2), s3 = vaddvq_s32(acc3);
+
+            for (size_t k = K16; k < partial_d_; k++) {
+                int32_t a_val = a_row[k];
+                s0 += a_val * static_cast<int32_t>(b0[k]);
+                s1 += a_val * static_cast<int32_t>(b1[k]);
+                s2 += a_val * static_cast<int32_t>(b2[k]);
+                s3 += a_val * static_cast<int32_t>(b3[k]);
+            }
+
+            c_row[n + 0] = s0;
+            c_row[n + 1] = s1;
+            c_row[n + 2] = s2;
+            c_row[n + 3] = s3;
+        }
+
+        for (; n < batch_n_y; n++) {
+            const int8_t* b_row = batch_y_p + n * d;
+            int32x4_t acc = vdupq_n_s32(0);
+            size_t k = 0;
+            for (; k < K16; k += 16)
+                acc = vdotq_s32(acc, vld1q_s8(a_row + k), vld1q_s8(b_row + k));
+            int32_t sum = vaddvq_s32(acc);
+            for (; k < partial_d_; k++)
+                sum += static_cast<int32_t>(a_row[k]) * static_cast<int32_t>(b_row[k]);
+            c_row[n] = sum;
+        }
+    }
+}
+#endif
+
+/**
+ * @brief Performs matrix multiplication via BLAS sgemm: C = X * Y^T
+ *
+ * Pure sgemm call with partial-dimension support via the k parameter.
+ * Both X (batch_n_x × d) and Y (batch_n_y × d) are row-major; sgemm handles
+ * the transpose by using the leading dimension as stride.
+ */
+inline void BlasMatrixMultiplication(
+    const float* SKM_RESTRICT batch_x_p,
+    const float* SKM_RESTRICT batch_y_p,
+    const size_t batch_n_x,
+    const size_t batch_n_y,
+    const size_t d,
+    const size_t partial_d,
+    float* SKM_RESTRICT tmp_distances_buf
+) {
+    const char trans_a = 'T';
+    const char trans_b = 'N';
+
+    int m = static_cast<int>(batch_n_y);
+    int n = static_cast<int>(batch_n_x);
+    int k = static_cast<int>(partial_d > 0 && partial_d < d ? partial_d : d);
+    float alpha = 1.0f;
+    float beta = 0.0f;
+    int lda = static_cast<int>(d);
+    int ldb = static_cast<int>(d);
+    int ldc = static_cast<int>(batch_n_y);
+
+    sgemm_(
+        &trans_a,
+        &trans_b,
+        &m,
+        &n,
+        &k,
+        &alpha,
+        batch_y_p,
+        &lda,
+        batch_x_p,
+        &ldb,
+        &beta,
+        tmp_distances_buf,
+        &ldc
+    );
+}
+
+/**
+ * @brief Generic matrix multiplication dispatcher: C = X * Y^T
+ *
+ * Routes to XNNPACK (if USE_XNNPACK env var is set) or falls back to BLAS sgemm.
+ * Supports partial-dimension computation via partial_d.
+ */
+inline void MatrixMultiplication(
+    const float* SKM_RESTRICT batch_x_p,
+    const float* SKM_RESTRICT batch_y_p,
+    const size_t batch_n_x,
+    const size_t batch_n_y,
+    const size_t d,
+    const size_t partial_d,
+    float* SKM_RESTRICT tmp_distances_buf
+) {
+    if (UseXnnpack() && InitXnnpack()) {
+        XnnpackMatmulF32(
+            batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, partial_d, tmp_distances_buf
+        );
+        return;
+    }
+    BlasMatrixMultiplication(
+        batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, partial_d, tmp_distances_buf
+    );
+}
+
+} // namespace matmul
+} // namespace skmeans

From f00dd076daae88dd4040aaeeff3fcd9b43177de7 Mon Sep 17 00:00:00 2001
From: Joseph Hwang <joseph@motherduck.com>
Date: Sun, 22 Mar 2026 23:14:48 -0700
Subject: [PATCH 3/5] XNNpack

---
 .../end_to_end/end_to_end_superkmeans.cpp     |   6 +-
 .../superkmeans/distance_computers/matmul.h   | 148 +++++++-----------
 include/superkmeans/superkmeans.h             |  17 +-
 3 files changed, 62 insertions(+), 109 deletions(-)

diff --git a/benchmarks/end_to_end/end_to_end_superkmeans.cpp b/benchmarks/end_to_end/end_to_end_superkmeans.cpp
index 9b50099..59361de 100644
--- a/benchmarks/end_to_end/end_to_end_superkmeans.cpp
+++ b/benchmarks/end_to_end/end_to_end_superkmeans.cpp
@@ -67,15 +67,15 @@ int main(int argc, char* argv[]) {
 
     skmeans::SuperKMeansConfig config;
     config.iters = n_iters;
-    config.verbose = true;
+    config.verbose = false;
     config.n_threads = THREADS;
     config.objective_k = 100;
     config.ann_explore_fraction = 0.01f;
     config.unrotate_centroids = true;
     config.early_termination = false;
     config.sampling_fraction = sampling_fraction;
-    // TODO: For joseph experiment
-    config.use_blas_only = false;
+    // Used for this set of benchmarks
+    config.use_blas_only = true;
 
     auto is_angular = std::find(
         bench_utils::ANGULAR_DATASETS.begin(), bench_utils::ANGULAR_DATASETS.end(), dataset
diff --git a/include/superkmeans/distance_computers/matmul.h b/include/superkmeans/distance_computers/matmul.h
index 4aa0bdc..742acdd 100644
--- a/include/superkmeans/distance_computers/matmul.h
+++ b/include/superkmeans/distance_computers/matmul.h
@@ -36,14 +36,13 @@ inline bool InitXnnpack() {
 }
 
 /**
- * @brief Packs kernel rows into a contiguous partial-dimension buffer for XNNPACK.
+ * XNNPack does not support performing matrix multiplication on an arbitrary subset of dimensions
+ * Based on our benchmarks - we believe it is sufficiently performant to just re-copy the subset of dimensions that we care about
  *
  * When partial_d < d, XNNPACK requires the kernel to be contiguous N × partial_d.
  * This copies the first partial_d elements from each of the n_rows rows (stride d)
  * into a thread-local contiguous buffer and returns a pointer to it.
- * When partial_d == d, returns the original pointer with no copy.
- *
- * @return Pointer to contiguous kernel data and the effective dimension count.
+ * If partial_d == d, returns the original pointer with no copy.
  */
 template <typename T>
 inline std::pair<const T*, size_t> PackKernelForPartialD(
@@ -53,6 +52,7 @@ inline std::pair<const T*, size_t> PackKernelForPartialD(
     if (partial_d_ == d) {
         return {kernel, d};
     }
+    // Note: this file uses threadlocals to avoid repeated allocations
     thread_local std::vector<char> buf;
     buf.resize(n_rows * partial_d_ * sizeof(T));
     auto* dst = reinterpret_cast<T*>(buf.data());
@@ -63,14 +63,10 @@ inline std::pair<const T*, size_t> PackKernelForPartialD(
 }
 
 /**
- * @brief Performs matrix multiplication using XNNPACK's fully-connected operator: C = X * Y^T
- *
  * Uses XNNPACK's NC F32 fully-connected operator where X is the "input" (batch_n_x × d)
  * and Y is the "kernel" (batch_n_y × d). Supports partial-dimension computation via partial_d:
  * the input (X) uses XNNPACK's input_stride to skip trailing dimensions without copying,
  * while the kernel (Y) rows are packed into a contiguous buffer when partial_d < d.
- *
- * Thread-local pthreadpool with g_n_threads threads is used. Not safe to call from OMP parallel regions.
  */
 inline void XnnpackMatmulF32(
     const float* SKM_RESTRICT batch_x_p,
@@ -86,6 +82,8 @@ inline void XnnpackMatmulF32(
     size_t partial_d_;
     {
         SKM_PROFILE_SCOPE("xnnpack_f32/pack");
+        // Only Y needs repacking: XNNPACK supports strided access for X (via input_stride),
+        // but requires Y to be contiguous N × partial_d.
         std::tie(kernel_p, partial_d_) = PackKernelForPartialD(batch_y_p, batch_n_y, d, partial_d);
     }
 
@@ -93,16 +91,37 @@ inline void XnnpackMatmulF32(
         SKM_PROFILE_SCOPE("xnnpack_f32/matmul");
         thread_local pthreadpool_t tp = pthreadpool_create(omp_get_max_threads());
         xnn_operator_t op = nullptr;
+        // Create the fully-connected operator. XNNPACK's API is designed for neural network layers,
+        // so we have to pass several parameters (bias, clamping, flags) that we don't use.
         xnn_create_fully_connected_nc_f32(
-            partial_d_, batch_n_y, d, batch_n_y,
-            kernel_p, nullptr,
+            // input_channels: number of dimensions to dot-product over
+            partial_d_,
+            // output_channels: number of rows in Y (columns in the output matrix)
+            batch_n_y,
+            // input_stride: full row stride in X, even if we only use partial_d_ dims
+            d,
+            // output_stride
+            batch_n_y,
+            // the Y matrix
+            kernel_p,
+            // Do not set a "bias" since we want raw dot products. The "bias" is a vector that is added to the output.
+            nullptr,
+            // no output clamping (we should be able to use the full range of f32 values)
             -std::numeric_limits<float>::infinity(),
             std::numeric_limits<float>::infinity(),
-            0, nullptr, &op
+            // Don't set any special flags
+            0,
+            // Do not set a cache; we recreate the operator each call
+            nullptr,
+            &op
         );
+        // Allocate internal buffers for batch_n_x input rows
         xnn_reshape_fully_connected_nc_f32(op, batch_n_x, tp);
+        // Bind X as input and tmp_distances_buf as output
         xnn_setup_fully_connected_nc_f32(op, batch_x_p, tmp_distances_buf);
+        // Execute the matmul using the thread pool
         xnn_run_operator(op, tp);
+        // Free the operator (the thread pool is reused across calls)
         xnn_delete_operator(op);
     }
 }
@@ -130,6 +149,8 @@ inline void XnnpackMatmulI8F32(
     size_t partial_d_;
     {
         SKM_PROFILE_SCOPE("xnnpack_i8f32/pack");
+        // Only Y needs repacking: XNNPACK supports strided access for X (via input_stride),
+        // but requires Y to be contiguous N × partial_d.
         std::tie(kernel_p, partial_d_) = PackKernelForPartialD(batch_y_p, batch_n_y, d, partial_d);
     }
 
@@ -144,101 +165,45 @@ inline void XnnpackMatmulI8F32(
         thread_local pthreadpool_t tp_i8 = pthreadpool_create(omp_get_max_threads());
         xnn_operator_t op = nullptr;
         xnn_create_fully_connected_nc_qd8_f32_qc8w(
-            partial_d_, batch_n_y, d, batch_n_y,
-            kernel_scale.data(), kernel_p, nullptr,
+            // input_channels: number of dimensions to dot-product over
+            partial_d_,
+            // output_channels: number of rows in Y (columns in the output matrix)
+            batch_n_y,
+            // input_stride: full row stride in X, even if we only use partial_d_ dims
+            d,
+            // output_stride
+            batch_n_y,
+            // per-row scale for Y; set to 1.0 for identity quantization
+            kernel_scale.data(),
+            // the Y matrix (XNNPACK calls this "kernel")
+            kernel_p,
+            // Do not set a "bias" since we want raw dot products. The "bias" is a vector that is added to the output.
+            nullptr,
+            // no output clamping (we should be able to use the full range of f32 values)
             -std::numeric_limits<float>::infinity(),
             std::numeric_limits<float>::infinity(),
-            0, nullptr, &op
+            // Don't set any special flags
+            0,
+            // Do not set a cache; we recreate the operator each call
+            nullptr,
+            &op
         );
+        // Allocate internal buffers for batch_n_x input rows; also reports workspace size needed
         size_t ws_size = 0;
         xnn_reshape_fully_connected_nc_qd8_f32_qc8w(op, batch_n_x, &ws_size, tp_i8);
         thread_local std::vector<char> ws;
         ws.resize(ws_size);
+        // Bind X as input, tmp_distances_buf as output, plus workspace and quantization params
         xnn_setup_fully_connected_nc_qd8_f32_qc8w(
             op, batch_x_p, tmp_distances_buf, ws.data(), qparams.data()
         );
+        // Execute the matmul using the thread pool
         xnn_run_operator(op, tp_i8);
+        // Free the operator (the thread pool is reused across calls)
         xnn_delete_operator(op);
     }
 }
 
-#if defined(__aarch64__)
-/**
- * @brief Performs int8 matrix multiplication using NEON SDOT: C = X * Y^T
- *
- * Hand-written kernel using vdotq_s32 (SDOT) instructions for int8 dot products.
- * Both X and Y are row-major with stride d; only the first partial_d dimensions are used.
- * Output is int32 accumulation of the dot products.
- */
-inline void NeonSdotMatmulI8(
-    const int8_t* SKM_RESTRICT batch_x_p,
-    const int8_t* SKM_RESTRICT batch_y_p,
-    const size_t batch_n_x,
-    const size_t batch_n_y,
-    const size_t d,
-    const size_t partial_d,
-    int32_t* SKM_RESTRICT tmp_distances_buf
-) {
-    const size_t partial_d_ = (partial_d > 0 && partial_d < d) ? partial_d : d;
-    const size_t K16 = partial_d_ & ~(size_t)15;
-
-#pragma omp parallel for schedule(static) num_threads(g_n_threads)
-    for (size_t m = 0; m < batch_n_x; m++) {
-        const int8_t* a_row = batch_x_p + m * d;
-        int32_t* c_row = tmp_distances_buf + m * batch_n_y;
-
-        size_t n = 0;
-        for (; n + 4 <= batch_n_y; n += 4) {
-            const int8_t* b0 = batch_y_p + (n + 0) * d;
-            const int8_t* b1 = batch_y_p + (n + 1) * d;
-            const int8_t* b2 = batch_y_p + (n + 2) * d;
-            const int8_t* b3 = batch_y_p + (n + 3) * d;
-
-            int32x4_t acc0 = vdupq_n_s32(0);
-            int32x4_t acc1 = vdupq_n_s32(0);
-            int32x4_t acc2 = vdupq_n_s32(0);
-            int32x4_t acc3 = vdupq_n_s32(0);
-
-            for (size_t k = 0; k < K16; k += 16) {
-                int8x16_t va = vld1q_s8(a_row + k);
-                acc0 = vdotq_s32(acc0, va, vld1q_s8(b0 + k));
-                acc1 = vdotq_s32(acc1, va, vld1q_s8(b1 + k));
-                acc2 = vdotq_s32(acc2, va, vld1q_s8(b2 + k));
-                acc3 = vdotq_s32(acc3, va, vld1q_s8(b3 + k));
-            }
-
-            int32_t s0 = vaddvq_s32(acc0), s1 = vaddvq_s32(acc1);
-            int32_t s2 = vaddvq_s32(acc2), s3 = vaddvq_s32(acc3);
-
-            for (size_t k = K16; k < partial_d_; k++) {
-                int32_t a_val = a_row[k];
-                s0 += a_val * static_cast<int32_t>(b0[k]);
-                s1 += a_val * static_cast<int32_t>(b1[k]);
-                s2 += a_val * static_cast<int32_t>(b2[k]);
-                s3 += a_val * static_cast<int32_t>(b3[k]);
-            }
-
-            c_row[n + 0] = s0;
-            c_row[n + 1] = s1;
-            c_row[n + 2] = s2;
-            c_row[n + 3] = s3;
-        }
-
-        for (; n < batch_n_y; n++) {
-            const int8_t* b_row = batch_y_p + n * d;
-            int32x4_t acc = vdupq_n_s32(0);
-            size_t k = 0;
-            for (; k < K16; k += 16)
-                acc = vdotq_s32(acc, vld1q_s8(a_row + k), vld1q_s8(b_row + k));
-            int32_t sum = vaddvq_s32(acc);
-            for (; k < partial_d_; k++)
-                sum += static_cast<int32_t>(a_row[k]) * static_cast<int32_t>(b_row[k]);
-            c_row[n] = sum;
-        }
-    }
-}
-#endif
-
 /**
  * @brief Performs matrix multiplication via BLAS sgemm: C = X * Y^T
  *
@@ -299,6 +264,7 @@ inline void MatrixMultiplication(
     const size_t partial_d,
     float* SKM_RESTRICT tmp_distances_buf
 ) {
+    // While we do support XnnpackMatmulF32 - we don't use it in any e2e benchmarks as of now. 
     if (UseXnnpack() && InitXnnpack()) {
         XnnpackMatmulF32(
             batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, partial_d, tmp_distances_buf
diff --git a/include/superkmeans/superkmeans.h b/include/superkmeans/superkmeans.h
index b0e7faf..9e9d3a5 100644
--- a/include/superkmeans/superkmeans.h
+++ b/include/superkmeans/superkmeans.h
@@ -948,19 +948,15 @@ class SuperKMeans {
         size_t& iter_idx,
         const bool is_first_iter,
         std::vector<SuperKMeansIterationStats>& target_stats,
-        const bool run_quantized_assignment = false,
+        const bool run_quantized_assignment = true,
         const bool run_f32_assignment = true,
-        const bool use_quantized_assignment_for_update = false
+        const bool use_quantized_assignment_for_update = true
     ) {
-        std::cout << "Training centroids iteration=" << iter_idx << std::endl;
-
         if (!is_first_iter) {
             std::swap(horizontal_centroids, prev_centroids);
         }
 
         if constexpr (GEMM_ONLY) {
-            std::cout << "Running GEMM loop" << std::endl;
-
             // f32 assignment path
             if (run_f32_assignment) {
                 SKM_PROFILE_SCOPE("assign_f32");
@@ -993,15 +989,6 @@ class SuperKMeans {
                         ++match;
                         continue;
                     }
-                    if (mismatches_printed < 10) {
-                        std::cout << "  mismatch[" << mismatches_printed << "] sample=" << s
-                                  << " i8_assign=" << i8_assignments[s]
-                                  << " f32_assign=" << assignments[s]
-                                  << " i8_dist=" << i8_distances[s]
-                                  << " f32_dist=" << distances[s]
-                                  << " delta=" << (i8_distances[s] - distances[s]) << std::endl;
-                        ++mismatches_printed;
-                    }
                 }
                 std::cout << "i8 vs f32 assignment agreement: " << match << " / " << n_samples
                           << " (" << (100.0 * match / n_samples) << "%)" << std::endl;

From 4a5cb30385d243ef627383a619616403a053c636 Mon Sep 17 00:00:00 2001
From: Joseph Hwang <joseph@motherduck.com>
Date: Sun, 22 Mar 2026 23:25:40 -0700
Subject: [PATCH 4/5] Other changes

---
 .../distance_computers/batch_computers.h            | 13 +++++--------
 include/superkmeans/distance_computers/matmul.h     |  9 ++-------
 include/superkmeans/superkmeans.h                   |  4 ++--
 3 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/include/superkmeans/distance_computers/batch_computers.h b/include/superkmeans/distance_computers/batch_computers.h
index 2d96ab0..160a76a 100644
--- a/include/superkmeans/distance_computers/batch_computers.h
+++ b/include/superkmeans/distance_computers/batch_computers.h
@@ -73,7 +73,7 @@ inline void FindKNearestNeighborsI8(
 ) {
     using MatrixR = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
     matmul::InitXnnpack();
-    SKM_PROFILE_SCOPE("blas_i8_knn");
+    SKM_PROFILE_SCOPE("i8_knn");
     std::fill_n(out_distances, n_x * k, std::numeric_limits<float>::max());
     std::fill_n(out_knn, n_x * k, static_cast<uint32_t>(-1));
 
@@ -89,15 +89,12 @@ inline void FindKNearestNeighborsI8(
             if (j + Y_BATCH_SIZE > n_y) {
                 batch_n_y = n_y - j;
             }
-            {
-                SKM_PROFILE_SCOPE("blas_i8_knn/gemm");
-                matmul::XnnpackMatmulI8F32(
-                    batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, 0, tmp_distances_buf
-                );
-            }
+            matmul::XnnpackMatmulI8F32(
+                batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, 0, tmp_distances_buf
+            );
             Eigen::Map<MatrixR> distances_matrix(tmp_distances_buf, batch_n_x, batch_n_y);
 
-            SKM_PROFILE_SCOPE("blas_i8_knn/top_k");
+            SKM_PROFILE_SCOPE("i8_knn/top_k");
 #pragma omp parallel for num_threads(g_n_threads)
             for (size_t r = 0; r < batch_n_x; ++r) {
                 const auto i_idx = i + r;
diff --git a/include/superkmeans/distance_computers/matmul.h b/include/superkmeans/distance_computers/matmul.h
index 742acdd..fd6f623 100644
--- a/include/superkmeans/distance_computers/matmul.h
+++ b/include/superkmeans/distance_computers/matmul.h
@@ -15,10 +15,6 @@
 #include <pthreadpool.h>
 #include <xnnpack.h>
 
-#if defined(__aarch64__)
-#include <arm_neon.h>
-#endif
-
 // Eigen already declares sgemm_, so we don't need to redeclare it
 extern "C" {}
 
@@ -144,18 +140,17 @@ inline void XnnpackMatmulI8F32(
     const size_t partial_d,
     float* SKM_RESTRICT tmp_distances_buf
 ) {
-    SKM_PROFILE_SCOPE("xnnpack_i8f32");
     const int8_t* kernel_p;
     size_t partial_d_;
     {
-        SKM_PROFILE_SCOPE("xnnpack_i8f32/pack");
+        SKM_PROFILE_SCOPE("i8_knn/pack");
         // Only Y needs repacking: XNNPACK supports strided access for X (via input_stride),
         // but requires Y to be contiguous N × partial_d.
         std::tie(kernel_p, partial_d_) = PackKernelForPartialD(batch_y_p, batch_n_y, d, partial_d);
     }
 
     {
-        SKM_PROFILE_SCOPE("xnnpack_i8f32/matmul");
+        SKM_PROFILE_SCOPE("i8_knn/matmul");
         thread_local std::vector<float> kernel_scale;
         kernel_scale.assign(batch_n_y, 1.0f);
 
diff --git a/include/superkmeans/superkmeans.h b/include/superkmeans/superkmeans.h
index 9e9d3a5..1eea344 100644
--- a/include/superkmeans/superkmeans.h
+++ b/include/superkmeans/superkmeans.h
@@ -699,12 +699,12 @@ class SuperKMeans {
             GetQuantizedL2NormsRowMajor(quantized_centroids_buf.data(), n_clusters, quantized_centroid_norms.get());
         }
 
-        const size_t i8_k = 2;
+        const size_t i8_k = 1;
         std::vector<uint32_t> i8_knn(n_samples * i8_k);
         std::vector<float> i8_knn_distances(n_samples * i8_k);
         std::vector<float> i8_tmp_distances_buf(X_BATCH_SIZE * Y_BATCH_SIZE);
         {
-            SKM_PROFILE_SCOPE("assign_i8/knn");
+            SKM_PROFILE_SCOPE("assign_i8/i8_knn");
             FindKNearestNeighborsI8(
                 quantized_data.data(),
                 quantized_centroids_buf.data(),

From 50865fcc4c002d208321c666f9bb0d636ba4b53b Mon Sep 17 00:00:00 2001
From: Joseph Hwang <joseph@motherduck.com>
Date: Sun, 22 Mar 2026 23:50:07 -0700
Subject: [PATCH 5/5] Ok

---
 f32_benchmark_output.txt                      | 117 ++++++++++++++++
 f32_xnnpack_benchmark_output.txt              | 120 ++++++++++++++++
 ...nnpack_10_knn_rescore_benchmark_output.txt | 132 ++++++++++++++++++
 i8_xnnpack_benchmark_output.txt               | 131 +++++++++++++++++
 include/superkmeans/superkmeans.h             |   5 +-
 5 files changed, 502 insertions(+), 3 deletions(-)
 create mode 100644 f32_benchmark_output.txt
 create mode 100644 f32_xnnpack_benchmark_output.txt
 create mode 100644 i8_xnnpack_10_knn_rescore_benchmark_output.txt
 create mode 100644 i8_xnnpack_benchmark_output.txt

diff --git a/f32_benchmark_output.txt b/f32_benchmark_output.txt
new file mode 100644
index 0000000..ba38eab
--- /dev/null
+++ b/f32_benchmark_output.txt
@@ -0,0 +1,117 @@
+##########################################
+# DATASET: cohere2m
+##########################################
+
+==========================================
+Running benchmarks for cohere2m...
+==========================================
+
+----------------------------------------
+1/3: SuperKMeans
+----------------------------------------
+=== Running algorithm: superkmeans ===
+Dataset: cohere2m (n=2000000, d=1024)
+n_clusters=5656 n_iters=5 sampling_fraction=1
+Eigen # threads: 1 (note: it will always be 1 if BLAS is enabled)
+Front dimensions (d') = 128
+Trailing dimensions (d'') = 768
+Sampling data...
+Using 2000000 vectors
+Iteration 1/5 | Objective: 1.90344e+06 | Objective improvement: 0 | Shift: 2135.92 | Split: 0 | Recall: 0 [BLAS-only]
+
+Iteration 2/5 | Objective: 1.13074e+06 | Objective improvement: 0.405949 | Shift: 80.9742 | Split: 0 | Recall: 0 [BLAS-only]
+
+Iteration 3/5 | Objective: 1.09574e+06 | Objective improvement: 0.0309505 | Shift: 30.4903 | Split: 0 | Recall: 0 [BLAS-only]
+
+Iteration 4/5 | Objective: 1.08237e+06 | Objective improvement: 0.012203 | Shift: 16.1337 | Split: 0 | Recall: 0 [BLAS-only]
+
+Iteration 5/5 | Objective: 1.07518e+06 | Objective improvement: 0.00664073 | Shift: 10.0611 | Split: 0 | Recall: 0 [BLAS-only]
+
+
+========== PROFILER RESULTS ==========
+assign_f32                                 111.204s (90.648%) [5 calls]
+rotator                                      5.507s (4.489%) [3 calls]
+update_centroids                             3.041s (2.479%) [5 calls]
+norms_calc                                   2.724s (2.220%) [7 calls]
+fill                                         0.078s (0.063%) [5 calls]
+generating_centroids                         0.077s (0.062%)
+consolidate                                  0.029s (0.024%) [5 calls]
+  - pdxify                                   0.034s (0.028%) [6 calls]
+  - splitting                                0.002s (0.001%) [5 calls]
+  - normalize                                0.000s (0.000%) [5 calls]
+unrotator                                    0.011s (0.009%)
+compute_cost                                 0.003s (0.002%) [5 calls]
+shift                                        0.003s (0.002%) [5 calls]
+allocator                                    0.000s (0.000%)
+-------------------------------------------
+TOTAL                                      122.677s
+===========================================
+
+Training completed in 122746.855 ms
+Actual iterations: 5 (requested: 5)
+Final objective: 1075182.750
+
+--- Computing Recall ---
+Ground truth file: /Users/jzh/research/SuperKMeans/benchmarks/ground_truth/cohere2m.json
+Queries file: /Users/jzh/research/SuperKMeans/benchmarks/data/data_cohere2m_test.bin
+Using 1000 queries (loaded 1000 from ground truth)
+Cluster size stats: mean=353.607, gmean=308.812, std=173.238, CV=0.490, min=15, max=1517
+
+--- Recall@10 ---
+Recall@   5 ( 0.10% centroids,     1969 avg vectors): 0.6299 ± 0.3537
+Recall@  11 ( 0.20% centroids,     4202 avg vectors): 0.7148 ± 0.3268
+Recall@  16 ( 0.30% centroids,     6050 avg vectors): 0.7578 ± 0.3046
+Recall@  22 ( 0.40% centroids,     8230 avg vectors): 0.7848 ± 0.2895
+Recall@  28 ( 0.50% centroids,    10414 avg vectors): 0.8060 ± 0.2763
+Recall@  33 ( 0.60% centroids,    12214 avg vectors): 0.8204 ± 0.2652
+Recall@  39 ( 0.70% centroids,    14371 avg vectors): 0.8350 ± 0.2550
+Recall@  45 ( 0.80% centroids,    16532 avg vectors): 0.8434 ± 0.2492
+Recall@  50 ( 0.90% centroids,    18323 avg vectors): 0.8515 ± 0.2423
+Recall@  56 ( 1.00% centroids,    20488 avg vectors): 0.8587 ± 0.2370
+Recall@  70 ( 1.25% centroids,    25527 avg vectors): 0.8703 ± 0.2267
+Recall@  84 ( 1.50% centroids,    30547 avg vectors): 0.8799 ± 0.2161
+Recall@  98 ( 1.75% centroids,    35559 avg vectors): 0.8890 ± 0.2079
+Recall@ 113 ( 2.00% centroids,    40918 avg vectors): 0.8961 ± 0.1987
+Recall@ 127 ( 2.25% centroids,    45937 avg vectors): 0.9023 ± 0.1914
+Recall@ 141 ( 2.50% centroids,    50928 avg vectors): 0.9065 ± 0.1872
+Recall@ 155 ( 2.75% centroids,    55956 avg vectors): 0.9114 ± 0.1808
+Recall@ 169 ( 3.00% centroids,    60934 avg vectors): 0.9149 ± 0.1764
+Recall@ 183 ( 3.25% centroids,    65893 avg vectors): 0.9177 ± 0.1731
+Recall@ 197 ( 3.50% centroids,    70869 avg vectors): 0.9203 ± 0.1698
+Recall@ 212 ( 3.75% centroids,    76187 avg vectors): 0.9243 ± 0.1649
+Recall@ 226 ( 4.00% centroids,    81156 avg vectors): 0.9282 ± 0.1576
+Recall@ 240 ( 4.25% centroids,    86139 avg vectors): 0.9312 ± 0.1535
+Recall@ 254 ( 4.50% centroids,    91087 avg vectors): 0.9349 ± 0.1497
+Recall@ 268 ( 4.75% centroids,    96056 avg vectors): 0.9378 ± 0.1470
+Recall@ 282 ( 5.00% centroids,   101024 avg vectors): 0.9389 ± 0.1459
+Recall@ 565 (10.00% centroids,   201269 avg vectors): 0.9642 ± 0.1076
+
+--- Recall@100 ---
+Recall@   5 ( 0.10% centroids,     1969 avg vectors): 0.5628 ± 0.2819
+Recall@  11 ( 0.20% centroids,     4202 avg vectors): 0.6625 ± 0.2638
+Recall@  16 ( 0.30% centroids,     6050 avg vectors): 0.7089 ± 0.2510
+Recall@  22 ( 0.40% centroids,     8230 avg vectors): 0.7443 ± 0.2367
+Recall@  28 ( 0.50% centroids,    10414 avg vectors): 0.7661 ± 0.2284
+Recall@  33 ( 0.60% centroids,    12214 avg vectors): 0.7812 ± 0.2212
+Recall@  39 ( 0.70% centroids,    14371 avg vectors): 0.7967 ± 0.2135
+Recall@  45 ( 0.80% centroids,    16532 avg vectors): 0.8084 ± 0.2068
+Recall@  50 ( 0.90% centroids,    18323 avg vectors): 0.8173 ± 0.2009
+Recall@  56 ( 1.00% centroids,    20488 avg vectors): 0.8270 ± 0.1961
+Recall@  70 ( 1.25% centroids,    25527 avg vectors): 0.8421 ± 0.1874
+Recall@  84 ( 1.50% centroids,    30547 avg vectors): 0.8553 ± 0.1782
+Recall@  98 ( 1.75% centroids,    35559 avg vectors): 0.8660 ± 0.1708
+Recall@ 113 ( 2.00% centroids,    40918 avg vectors): 0.8753 ± 0.1637
+Recall@ 127 ( 2.25% centroids,    45937 avg vectors): 0.8830 ± 0.1561
+Recall@ 141 ( 2.50% centroids,    50928 avg vectors): 0.8893 ± 0.1514
+Recall@ 155 ( 2.75% centroids,    55956 avg vectors): 0.8955 ± 0.1450
+Recall@ 169 ( 3.00% centroids,    60934 avg vectors): 0.9002 ± 0.1402
+Recall@ 183 ( 3.25% centroids,    65893 avg vectors): 0.9044 ± 0.1370
+Recall@ 197 ( 3.50% centroids,    70869 avg vectors): 0.9082 ± 0.1335
+Recall@ 212 ( 3.75% centroids,    76187 avg vectors): 0.9127 ± 0.1288
+Recall@ 226 ( 4.00% centroids,    81156 avg vectors): 0.9167 ± 0.1250
+Recall@ 240 ( 4.25% centroids,    86139 avg vectors): 0.9194 ± 0.1227
+Recall@ 254 ( 4.50% centroids,    91087 avg vectors): 0.9225 ± 0.1198
+Recall@ 268 ( 4.75% centroids,    96056 avg vectors): 0.9259 ± 0.1161
+Recall@ 282 ( 5.00% centroids,   101024 avg vectors): 0.9280 ± 0.1141
+Recall@ 565 (10.00% centroids,   201269 avg vectors): 0.9572 ± 0.0804
+Results written to: /Users/jzh/research/SuperKMeans/benchmarks/results/default/end_to_end.csv
diff --git a/f32_xnnpack_benchmark_output.txt b/f32_xnnpack_benchmark_output.txt
new file mode 100644
index 0000000..1e774f9
--- /dev/null
+++ b/f32_xnnpack_benchmark_output.txt
@@ -0,0 +1,120 @@
+##########################################
+# DATASET: cohere2m
+##########################################
+
+==========================================
+Running benchmarks for cohere2m...
+==========================================
+
+----------------------------------------
+1/3: SuperKMeans
+----------------------------------------
+=== Running algorithm: superkmeans ===
+Dataset: cohere2m (n=2000000, d=1024)
+n_clusters=5656 n_iters=5 sampling_fraction=1
+Eigen # threads: 1 (note: it will always be 1 if BLAS is enabled)
+Front dimensions (d') = 128
+Trailing dimensions (d'') = 768
+Sampling data...
+Using 2000000 vectors
+Iteration 1/5 | Objective: 1.90344e+06 | Objective improvement: 0 | Shift: 2135.92 | Split: 0 | Recall: 0 [BLAS-only]
+
+Iteration 2/5 | Objective: 1.13074e+06 | Objective improvement: 0.405949 | Shift: 80.9742 | Split: 0 | Recall: 0 [BLAS-only]
+
+Iteration 3/5 | Objective: 1.09574e+06 | Objective improvement: 0.0309505 | Shift: 30.4903 | Split: 0 | Recall: 0 [BLAS-only]
+
+Iteration 4/5 | Objective: 1.08237e+06 | Objective improvement: 0.012203 | Shift: 16.1337 | Split: 0 | Recall: 0 [BLAS-only]
+
+Iteration 5/5 | Objective: 1.07518e+06 | Objective improvement: 0.00664073 | Shift: 10.0611 | Split: 0 | Recall: 0 [BLAS-only]
+
+
+========== PROFILER RESULTS ==========
+assign_f32                                 266.597s (50.005%) [5 calls]
+xnnpack_f32                                250.760s (47.034%) [110451 calls]
+  - matmul                                 246.981s (46.325%) [110268 calls]
+  - pack                                     0.050s (0.009%) [116917 calls]
+rotator                                      7.984s (1.498%) [3 calls]
+update_centroids                             5.001s (0.938%) [5 calls]
+norms_calc                                   2.627s (0.493%) [7 calls]
+generating_centroids                         0.094s (0.018%)
+consolidate                                  0.042s (0.008%) [5 calls]
+  - pdxify                                   0.049s (0.009%) [6 calls]
+  - splitting                                0.002s (0.000%) [5 calls]
+  - normalize                                0.000s (0.000%) [5 calls]
+fill                                         0.021s (0.004%) [5 calls]
+unrotator                                    0.013s (0.002%)
+compute_cost                                 0.004s (0.001%) [5 calls]
+shift                                        0.003s (0.001%) [5 calls]
+allocator                                    0.000s (0.000%)
+-------------------------------------------
+TOTAL                                      533.147s
+===========================================
+
+Training completed in 282555.208 ms
+Actual iterations: 5 (requested: 5)
+Final objective: 1075182.750
+
+--- Computing Recall ---
+Ground truth file: /Users/jzh/research/SuperKMeans/benchmarks/ground_truth/cohere2m.json
+Queries file: /Users/jzh/research/SuperKMeans/benchmarks/data/data_cohere2m_test.bin
+Using 1000 queries (loaded 1000 from ground truth)
+Cluster size stats: mean=353.607, gmean=308.812, std=173.238, CV=0.490, min=15, max=1517
+
+--- Recall@10 ---
+Recall@   5 ( 0.10% centroids,     1969 avg vectors): 0.6299 ± 0.3537
+Recall@  11 ( 0.20% centroids,     4202 avg vectors): 0.7148 ± 0.3268
+Recall@  16 ( 0.30% centroids,     6050 avg vectors): 0.7578 ± 0.3046
+Recall@  22 ( 0.40% centroids,     8230 avg vectors): 0.7848 ± 0.2895
+Recall@  28 ( 0.50% centroids,    10414 avg vectors): 0.8060 ± 0.2763
+Recall@  33 ( 0.60% centroids,    12214 avg vectors): 0.8204 ± 0.2652
+Recall@  39 ( 0.70% centroids,    14371 avg vectors): 0.8350 ± 0.2550
+Recall@  45 ( 0.80% centroids,    16532 avg vectors): 0.8434 ± 0.2492
+Recall@  50 ( 0.90% centroids,    18323 avg vectors): 0.8515 ± 0.2423
+Recall@  56 ( 1.00% centroids,    20488 avg vectors): 0.8587 ± 0.2370
+Recall@  70 ( 1.25% centroids,    25527 avg vectors): 0.8703 ± 0.2267
+Recall@  84 ( 1.50% centroids,    30547 avg vectors): 0.8799 ± 0.2161
+Recall@  98 ( 1.75% centroids,    35559 avg vectors): 0.8890 ± 0.2079
+Recall@ 113 ( 2.00% centroids,    40918 avg vectors): 0.8961 ± 0.1987
+Recall@ 127 ( 2.25% centroids,    45937 avg vectors): 0.9023 ± 0.1914
+Recall@ 141 ( 2.50% centroids,    50928 avg vectors): 0.9065 ± 0.1872
+Recall@ 155 ( 2.75% centroids,    55956 avg vectors): 0.9114 ± 0.1808
+Recall@ 169 ( 3.00% centroids,    60934 avg vectors): 0.9149 ± 0.1764
+Recall@ 183 ( 3.25% centroids,    65893 avg vectors): 0.9177 ± 0.1731
+Recall@ 197 ( 3.50% centroids,    70869 avg vectors): 0.9203 ± 0.1698
+Recall@ 212 ( 3.75% centroids,    76187 avg vectors): 0.9243 ± 0.1649
+Recall@ 226 ( 4.00% centroids,    81156 avg vectors): 0.9282 ± 0.1576
+Recall@ 240 ( 4.25% centroids,    86139 avg vectors): 0.9312 ± 0.1535
+Recall@ 254 ( 4.50% centroids,    91087 avg vectors): 0.9349 ± 0.1497
+Recall@ 268 ( 4.75% centroids,    96056 avg vectors): 0.9378 ± 0.1470
+Recall@ 282 ( 5.00% centroids,   101024 avg vectors): 0.9389 ± 0.1459
+Recall@ 565 (10.00% centroids,   201269 avg vectors): 0.9642 ± 0.1076
+
+--- Recall@100 ---
+Recall@   5 ( 0.10% centroids,     1969 avg vectors): 0.5628 ± 0.2819
+Recall@  11 ( 0.20% centroids,     4202 avg vectors): 0.6625 ± 0.2638
+Recall@  16 ( 0.30% centroids,     6050 avg vectors): 0.7089 ± 0.2510
+Recall@  22 ( 0.40% centroids,     8230 avg vectors): 0.7443 ± 0.2367
+Recall@  28 ( 0.50% centroids,    10414 avg vectors): 0.7661 ± 0.2284
+Recall@  33 ( 0.60% centroids,    12214 avg vectors): 0.7812 ± 0.2212
+Recall@  39 ( 0.70% centroids,    14371 avg vectors): 0.7967 ± 0.2135
+Recall@  45 ( 0.80% centroids,    16532 avg vectors): 0.8084 ± 0.2068
+Recall@  50 ( 0.90% centroids,    18323 avg vectors): 0.8173 ± 0.2009
+Recall@  56 ( 1.00% centroids,    20488 avg vectors): 0.8270 ± 0.1961
+Recall@  70 ( 1.25% centroids,    25527 avg vectors): 0.8421 ± 0.1874
+Recall@  84 ( 1.50% centroids,    30547 avg vectors): 0.8553 ± 0.1782
+Recall@  98 ( 1.75% centroids,    35559 avg vectors): 0.8660 ± 0.1708
+Recall@ 113 ( 2.00% centroids,    40918 avg vectors): 0.8753 ± 0.1637
+Recall@ 127 ( 2.25% centroids,    45937 avg vectors): 0.8830 ± 0.1561
+Recall@ 141 ( 2.50% centroids,    50928 avg vectors): 0.8893 ± 0.1514
+Recall@ 155 ( 2.75% centroids,    55956 avg vectors): 0.8955 ± 0.1450
+Recall@ 169 ( 3.00% centroids,    60934 avg vectors): 0.9002 ± 0.1402
+Recall@ 183 ( 3.25% centroids,    65893 avg vectors): 0.9044 ± 0.1370
+Recall@ 197 ( 3.50% centroids,    70869 avg vectors): 0.9082 ± 0.1335
+Recall@ 212 ( 3.75% centroids,    76187 avg vectors): 0.9127 ± 0.1288
+Recall@ 226 ( 4.00% centroids,    81156 avg vectors): 0.9167 ± 0.1250
+Recall@ 240 ( 4.25% centroids,    86139 avg vectors): 0.9194 ± 0.1227
+Recall@ 254 ( 4.50% centroids,    91087 avg vectors): 0.9225 ± 0.1198
+Recall@ 268 ( 4.75% centroids,    96056 avg vectors): 0.9259 ± 0.1161
+Recall@ 282 ( 5.00% centroids,   101024 avg vectors): 0.9280 ± 0.1141
+Recall@ 565 (10.00% centroids,   201269 avg vectors): 0.9572 ± 0.0804
+Results written to: /Users/jzh/research/SuperKMeans/benchmarks/results/default/end_to_end.csv
\ No newline at end of file
diff --git a/i8_xnnpack_10_knn_rescore_benchmark_output.txt b/i8_xnnpack_10_knn_rescore_benchmark_output.txt
new file mode 100644
index 0000000..d9fbe6d
--- /dev/null
+++ b/i8_xnnpack_10_knn_rescore_benchmark_output.txt
@@ -0,0 +1,132 @@
+##########################################
+# DATASET: cohere2m
+##########################################
+
+==========================================
+Running benchmarks for cohere2m...
+==========================================
+
+----------------------------------------
+1/3: SuperKMeans
+----------------------------------------
+=== Running algorithm: superkmeans ===
+Dataset: cohere2m (n=2000000, d=1024)
+n_clusters=5656 n_iters=5 sampling_fraction=1
+Eigen # threads: 1 (note: it will always be 1 if BLAS is enabled)
+Front dimensions (d') = 128
+Trailing dimensions (d'') = 768
+Sampling data...
+Using 2000000 vectors
+i8 vs f32 assignment agreement: 1999953 / 2000000 (99.9976%)
+Iteration 1/5 | Objective: 1.90344e+06 | Objective improvement: 0 | Shift: 2135.94 | Split: 0 | Recall: 0 [BLAS-only]
+
+i8 vs f32 assignment agreement: 1991285 / 2000000 (99.5643%)
+Iteration 2/5 | Objective: 1.13085e+06 | Objective improvement: 0.405892 | Shift: 81.4288 | Split: 0 | Recall: 0 [BLAS-only]
+
+i8 vs f32 assignment agreement: 1997058 / 2000000 (99.8529%)
+Iteration 3/5 | Objective: 1.09575e+06 | Objective improvement: 0.0310347 | Shift: 30.5562 | Split: 0 | Recall: 0 [BLAS-only]
+
+i8 vs f32 assignment agreement: 1997574 / 2000000 (99.8787%)
+Iteration 4/5 | Objective: 1.08238e+06 | Objective improvement: 0.0122005 | Shift: 16.2518 | Split: 0 | Recall: 0 [BLAS-only]
+
+i8 vs f32 assignment agreement: 1997675 / 2000000 (99.8838%)
+Iteration 5/5 | Objective: 1.0752e+06 | Objective improvement: 0.00663465 | Shift: 9.95498 | Split: 0 | Recall: 0 [BLAS-only]
+
+
+========== PROFILER RESULTS ==========
+assign_f32                                 111.674s (45.204%) [5 calls]
+assign_i8                                   65.600s (26.554%) [5 calls]
+  - i8_knn                                  60.490s (24.485%) [5 calls]
+  - rerank                                   3.569s (1.445%) [5 calls]
+i8_knn                                      60.490s (24.485%) [5 calls]
+  - matmul                                  52.337s (21.185%) [735 calls]
+  - top_k                                    8.099s (3.278%) [735 calls]
+  - pack                                     0.000s (0.000%) [735 calls]
+rotator                                      5.324s (2.155%) [3 calls]
+norms_calc                                   1.756s (0.711%) [13 calls]
+update_centroids                             1.292s (0.523%) [5 calls]
+quantize_data                                0.717s (0.290%)
+generating_centroids                         0.064s (0.026%)
+consolidate                                  0.048s (0.019%) [5 calls]
+  - pdxify                                   0.051s (0.021%) [6 calls]
+  - splitting                                0.002s (0.001%) [5 calls]
+  - normalize                                0.000s (0.000%) [5 calls]
+fill                                         0.038s (0.015%) [10 calls]
+quantize_centroids                           0.018s (0.007%) [5 calls]
+unrotator                                    0.016s (0.006%)
+compare_assignments                          0.007s (0.003%) [5 calls]
+shift                                        0.003s (0.001%) [5 calls]
+compute_cost                                 0.001s (0.000%) [5 calls]
+allocator                                    0.000s (0.000%)
+-------------------------------------------
+TOTAL                                      247.048s
+===========================================
+
+Training completed in 185900.305 ms
+Actual iterations: 5 (requested: 5)
+Final objective: 1075201.500
+
+--- Computing Recall ---
+Ground truth file: /Users/jzh/research/SuperKMeans/benchmarks/ground_truth/cohere2m.json
+Queries file: /Users/jzh/research/SuperKMeans/benchmarks/data/data_cohere2m_test.bin
+Using 1000 queries (loaded 1000 from ground truth)
+Cluster size stats: mean=353.607, gmean=308.569, std=173.012, CV=0.489, min=13, max=1502
+
+--- Recall@10 ---
+Recall@   5 ( 0.10% centroids,     1974 avg vectors): 0.6276 ± 0.3551
+Recall@  11 ( 0.20% centroids,     4209 avg vectors): 0.7162 ± 0.3258
+Recall@  16 ( 0.30% centroids,     6058 avg vectors): 0.7562 ± 0.3052
+Recall@  22 ( 0.40% centroids,     8247 avg vectors): 0.7840 ± 0.2895
+Recall@  28 ( 0.50% centroids,    10439 avg vectors): 0.8049 ± 0.2756
+Recall@  33 ( 0.60% centroids,    12238 avg vectors): 0.8193 ± 0.2658
+Recall@  39 ( 0.70% centroids,    14402 avg vectors): 0.8344 ± 0.2547
+Recall@  45 ( 0.80% centroids,    16559 avg vectors): 0.8425 ± 0.2500
+Recall@  50 ( 0.90% centroids,    18378 avg vectors): 0.8490 ± 0.2445
+Recall@  56 ( 1.00% centroids,    20543 avg vectors): 0.8580 ± 0.2370
+Recall@  70 ( 1.25% centroids,    25592 avg vectors): 0.8697 ± 0.2261
+Recall@  84 ( 1.50% centroids,    30603 avg vectors): 0.8796 ± 0.2163
+Recall@  98 ( 1.75% centroids,    35642 avg vectors): 0.8887 ± 0.2084
+Recall@ 113 ( 2.00% centroids,    40999 avg vectors): 0.8961 ± 0.1986
+Recall@ 127 ( 2.25% centroids,    46031 avg vectors): 0.9024 ± 0.1915
+Recall@ 141 ( 2.50% centroids,    51038 avg vectors): 0.9067 ± 0.1872
+Recall@ 155 ( 2.75% centroids,    56052 avg vectors): 0.9116 ± 0.1809
+Recall@ 169 ( 3.00% centroids,    61050 avg vectors): 0.9141 ± 0.1775
+Recall@ 183 ( 3.25% centroids,    66004 avg vectors): 0.9173 ± 0.1740
+Recall@ 197 ( 3.50% centroids,    70997 avg vectors): 0.9200 ± 0.1707
+Recall@ 212 ( 3.75% centroids,    76339 avg vectors): 0.9238 ± 0.1664
+Recall@ 226 ( 4.00% centroids,    81305 avg vectors): 0.9276 ± 0.1597
+Recall@ 240 ( 4.25% centroids,    86285 avg vectors): 0.9315 ± 0.1533
+Recall@ 254 ( 4.50% centroids,    91273 avg vectors): 0.9343 ± 0.1509
+Recall@ 268 ( 4.75% centroids,    96262 avg vectors): 0.9365 ± 0.1488
+Recall@ 282 ( 5.00% centroids,   101264 avg vectors): 0.9384 ± 0.1472
+Recall@ 565 (10.00% centroids,   201750 avg vectors): 0.9637 ± 0.1086
+
+--- Recall@100 ---
+Recall@   5 ( 0.10% centroids,     1974 avg vectors): 0.5620 ± 0.2818
+Recall@  11 ( 0.20% centroids,     4209 avg vectors): 0.6628 ± 0.2634
+Recall@  16 ( 0.30% centroids,     6058 avg vectors): 0.7087 ± 0.2514
+Recall@  22 ( 0.40% centroids,     8247 avg vectors): 0.7430 ± 0.2382
+Recall@  28 ( 0.50% centroids,    10439 avg vectors): 0.7655 ± 0.2291
+Recall@  33 ( 0.60% centroids,    12238 avg vectors): 0.7812 ± 0.2216
+Recall@  39 ( 0.70% centroids,    14402 avg vectors): 0.7963 ± 0.2143
+Recall@  45 ( 0.80% centroids,    16559 avg vectors): 0.8082 ± 0.2070
+Recall@  50 ( 0.90% centroids,    18378 avg vectors): 0.8165 ± 0.2016
+Recall@  56 ( 1.00% centroids,    20543 avg vectors): 0.8267 ± 0.1964
+Recall@  70 ( 1.25% centroids,    25592 avg vectors): 0.8418 ± 0.1877
+Recall@  84 ( 1.50% centroids,    30603 avg vectors): 0.8547 ± 0.1789
+Recall@  98 ( 1.75% centroids,    35642 avg vectors): 0.8656 ± 0.1709
+Recall@ 113 ( 2.00% centroids,    40999 avg vectors): 0.8747 ± 0.1639
+Recall@ 127 ( 2.25% centroids,    46031 avg vectors): 0.8820 ± 0.1576
+Recall@ 141 ( 2.50% centroids,    51038 avg vectors): 0.8892 ± 0.1515
+Recall@ 155 ( 2.75% centroids,    56052 avg vectors): 0.8954 ± 0.1447
+Recall@ 169 ( 3.00% centroids,    61050 avg vectors): 0.8996 ± 0.1410
+Recall@ 183 ( 3.25% centroids,    66004 avg vectors): 0.9038 ± 0.1370
+Recall@ 197 ( 3.50% centroids,    70997 avg vectors): 0.9077 ± 0.1338
+Recall@ 212 ( 3.75% centroids,    76339 avg vectors): 0.9121 ± 0.1291
+Recall@ 226 ( 4.00% centroids,    81305 avg vectors): 0.9160 ± 0.1257
+Recall@ 240 ( 4.25% centroids,    86285 avg vectors): 0.9193 ± 0.1227
+Recall@ 254 ( 4.50% centroids,    91273 avg vectors): 0.9221 ± 0.1201
+Recall@ 268 ( 4.75% centroids,    96262 avg vectors): 0.9250 ± 0.1177
+Recall@ 282 ( 5.00% centroids,   101264 avg vectors): 0.9279 ± 0.1140
+Recall@ 565 (10.00% centroids,   201750 avg vectors): 0.9570 ± 0.0807
+Results written to: /Users/jzh/research/SuperKMeans/benchmarks/results/default/end_to_end.csv
diff --git a/i8_xnnpack_benchmark_output.txt b/i8_xnnpack_benchmark_output.txt
new file mode 100644
index 0000000..edeaed9
--- /dev/null
+++ b/i8_xnnpack_benchmark_output.txt
@@ -0,0 +1,131 @@
+##########################################
+# DATASET: cohere2m
+##########################################
+
+==========================================
+Running benchmarks for cohere2m...
+==========================================
+
+----------------------------------------
+1/3: SuperKMeans
+----------------------------------------
+=== Running algorithm: superkmeans ===
+Dataset: cohere2m (n=2000000, d=1024)
+n_clusters=5656 n_iters=5 sampling_fraction=1
+Eigen # threads: 1 (note: it will always be 1 if BLAS is enabled)
+Front dimensions (d') = 128
+Trailing dimensions (d'') = 768
+Sampling data...
+Using 2000000 vectors
+i8 vs f32 assignment agreement: 1982225 / 2000000 (99.1112%)
+Iteration 1/5 | Objective: 1.03122e+12 | Objective improvement: 0 | Shift: 2135.99 | Split: 0 | Recall: 0 [BLAS-only]
+
+i8 vs f32 assignment agreement: 1521795 / 2000000 (76.0897%)
+Iteration 2/5 | Objective: 7.45367e+11 | Objective improvement: 0.277202 | Shift: 104.263 | Split: 0 | Recall: 0 [BLAS-only]
+
+i8 vs f32 assignment agreement: 1546745 / 2000000 (77.3372%)
+Iteration 3/5 | Objective: 6.46113e+11 | Objective improvement: 0.133162 | Shift: 44.8106 | Split: 2 | Recall: 0 [BLAS-only]
+
+i8 vs f32 assignment agreement: 1544260 / 2000000 (77.213%)
+Iteration 4/5 | Objective: 6.25703e+11 | Objective improvement: 0.0315884 | Shift: 26.0822 | Split: 0 | Recall: 0 [BLAS-only]
+
+i8 vs f32 assignment agreement: 1566088 / 2000000 (78.3044%)
+Iteration 5/5 | Objective: 6.08461e+11 | Objective improvement: 0.0275563 | Shift: 17.442 | Split: 0 | Recall: 0 [BLAS-only]
+
+
+========== PROFILER RESULTS ==========
+assign_f32                                 112.479s (44.681%) [5 calls]
+assign_i8                                   63.401s (25.185%) [5 calls]
+  - i8_knn                                  61.786s (24.544%) [5 calls]
+i8_knn                                      61.785s (24.543%) [5 calls]
+  - matmul                                  57.860s (22.984%) [735 calls]
+  - top_k                                    3.912s (1.554%) [735 calls]
+  - pack                                     0.001s (0.000%) [735 calls]
+update_centroids                             7.229s (2.871%) [5 calls]
+rotator                                      4.141s (1.645%) [3 calls]
+norms_calc                                   1.813s (0.720%) [13 calls]
+quantize_data                                0.710s (0.282%)
+consolidate                                  0.049s (0.019%) [5 calls]
+  - pdxify                                   0.052s (0.021%) [6 calls]
+  - splitting                                0.002s (0.001%) [5 calls]
+  - normalize                                0.000s (0.000%) [5 calls]
+generating_centroids                         0.047s (0.019%)
+fill                                         0.027s (0.011%) [10 calls]
+quantize_centroids                           0.019s (0.007%) [5 calls]
+shift                                        0.016s (0.006%) [5 calls]
+unrotator                                    0.014s (0.005%)
+compare_assignments                          0.007s (0.003%) [5 calls]
+compute_cost                                 0.002s (0.001%) [5 calls]
+allocator                                    0.000s (0.000%)
+-------------------------------------------
+TOTAL                                      251.737s
+===========================================
+
+Training completed in 189272.002 ms
+Actual iterations: 5 (requested: 5)
+Final objective: 608460800000.000
+
+--- Computing Recall ---
+Ground truth file: /Users/jzh/research/SuperKMeans/benchmarks/ground_truth/cohere2m.json
+Queries file: /Users/jzh/research/SuperKMeans/benchmarks/data/data_cohere2m_test.bin
+Using 1000 queries (loaded 1000 from ground truth)
+Cluster size stats: mean=353.607, gmean=281.562, std=227.723, CV=0.644, min=8, max=2740
+
+--- Recall@10 ---
+Recall@   5 ( 0.10% centroids,     2894 avg vectors): 0.6197 ± 0.3485
+Recall@  11 ( 0.20% centroids,     6342 avg vectors): 0.7118 ± 0.3183
+Recall@  16 ( 0.30% centroids,     9231 avg vectors): 0.7537 ± 0.3011
+Recall@  22 ( 0.40% centroids,    12667 avg vectors): 0.7821 ± 0.2865
+Recall@  28 ( 0.50% centroids,    16088 avg vectors): 0.8023 ± 0.2722
+Recall@  33 ( 0.60% centroids,    18964 avg vectors): 0.8151 ± 0.2631
+Recall@  39 ( 0.70% centroids,    22388 avg vectors): 0.8269 ± 0.2567
+Recall@  45 ( 0.80% centroids,    25782 avg vectors): 0.8379 ± 0.2484
+Recall@  50 ( 0.90% centroids,    28614 avg vectors): 0.8484 ± 0.2414
+Recall@  56 ( 1.00% centroids,    32041 avg vectors): 0.8574 ± 0.2324
+Recall@  70 ( 1.25% centroids,    39931 avg vectors): 0.8728 ± 0.2175
+Recall@  84 ( 1.50% centroids,    47762 avg vectors): 0.8832 ± 0.2098
+Recall@  98 ( 1.75% centroids,    55554 avg vectors): 0.8925 ± 0.2012
+Recall@ 113 ( 2.00% centroids,    63861 avg vectors): 0.8989 ± 0.1947
+Recall@ 127 ( 2.25% centroids,    71554 avg vectors): 0.9048 ± 0.1884
+Recall@ 141 ( 2.50% centroids,    79228 avg vectors): 0.9094 ± 0.1818
+Recall@ 155 ( 2.75% centroids,    86888 avg vectors): 0.9131 ± 0.1785
+Recall@ 169 ( 3.00% centroids,    94466 avg vectors): 0.9179 ± 0.1730
+Recall@ 183 ( 3.25% centroids,   102100 avg vectors): 0.9239 ± 0.1651
+Recall@ 197 ( 3.50% centroids,   109666 avg vectors): 0.9272 ± 0.1603
+Recall@ 212 ( 3.75% centroids,   117777 avg vectors): 0.9309 ± 0.1543
+Recall@ 226 ( 4.00% centroids,   125213 avg vectors): 0.9343 ± 0.1498
+Recall@ 240 ( 4.25% centroids,   132720 avg vectors): 0.9369 ± 0.1452
+Recall@ 254 ( 4.50% centroids,   140198 avg vectors): 0.9386 ± 0.1429
+Recall@ 268 ( 4.75% centroids,   147647 avg vectors): 0.9406 ± 0.1410
+Recall@ 282 ( 5.00% centroids,   155039 avg vectors): 0.9429 ± 0.1370
+Recall@ 565 (10.00% centroids,   300209 avg vectors): 0.9694 ± 0.0981
+
+--- Recall@100 ---
+Recall@   5 ( 0.10% centroids,     2894 avg vectors): 0.5502 ± 0.2816
+Recall@  11 ( 0.20% centroids,     6342 avg vectors): 0.6546 ± 0.2636
+Recall@  16 ( 0.30% centroids,     9231 avg vectors): 0.7005 ± 0.2513
+Recall@  22 ( 0.40% centroids,    12667 avg vectors): 0.7330 ± 0.2409
+Recall@  28 ( 0.50% centroids,    16088 avg vectors): 0.7587 ± 0.2305
+Recall@  33 ( 0.60% centroids,    18964 avg vectors): 0.7750 ± 0.2227
+Recall@  39 ( 0.70% centroids,    22388 avg vectors): 0.7902 ± 0.2156
+Recall@  45 ( 0.80% centroids,    25782 avg vectors): 0.8039 ± 0.2077
+Recall@  50 ( 0.90% centroids,    28614 avg vectors): 0.8140 ± 0.2014
+Recall@  56 ( 1.00% centroids,    32041 avg vectors): 0.8229 ± 0.1958
+Recall@  70 ( 1.25% centroids,    39931 avg vectors): 0.8418 ± 0.1841
+Recall@  84 ( 1.50% centroids,    47762 avg vectors): 0.8544 ± 0.1756
+Recall@  98 ( 1.75% centroids,    55554 avg vectors): 0.8658 ± 0.1680
+Recall@ 113 ( 2.00% centroids,    63861 avg vectors): 0.8751 ± 0.1610
+Recall@ 127 ( 2.25% centroids,    71554 avg vectors): 0.8830 ± 0.1555
+Recall@ 141 ( 2.50% centroids,    79228 avg vectors): 0.8907 ± 0.1470
+Recall@ 155 ( 2.75% centroids,    86888 avg vectors): 0.8964 ± 0.1424
+Recall@ 169 ( 3.00% centroids,    94466 avg vectors): 0.9020 ± 0.1369
+Recall@ 183 ( 3.25% centroids,   102100 avg vectors): 0.9072 ± 0.1326
+Recall@ 197 ( 3.50% centroids,   109666 avg vectors): 0.9114 ± 0.1290
+Recall@ 212 ( 3.75% centroids,   117777 avg vectors): 0.9156 ± 0.1245
+Recall@ 226 ( 4.00% centroids,   125213 avg vectors): 0.9197 ± 0.1207
+Recall@ 240 ( 4.25% centroids,   132720 avg vectors): 0.9232 ± 0.1165
+Recall@ 254 ( 4.50% centroids,   140198 avg vectors): 0.9263 ± 0.1127
+Recall@ 268 ( 4.75% centroids,   147647 avg vectors): 0.9291 ± 0.1097
+Recall@ 282 ( 5.00% centroids,   155039 avg vectors): 0.9324 ± 0.1060
+Recall@ 565 (10.00% centroids,   300209 avg vectors): 0.9623 ± 0.0715
+Results written to: /Users/jzh/research/SuperKMeans/benchmarks/results/default/end_to_end.csv
\ No newline at end of file
diff --git a/include/superkmeans/superkmeans.h b/include/superkmeans/superkmeans.h
index 1eea344..a39bd45 100644
--- a/include/superkmeans/superkmeans.h
+++ b/include/superkmeans/superkmeans.h
@@ -948,9 +948,9 @@ class SuperKMeans {
         size_t& iter_idx,
         const bool is_first_iter,
         std::vector<SuperKMeansIterationStats>& target_stats,
-        const bool run_quantized_assignment = true,
+        const bool run_quantized_assignment = false,
         const bool run_f32_assignment = true,
-        const bool use_quantized_assignment_for_update = true
+        const bool use_quantized_assignment_for_update = false
     ) {
         if (!is_first_iter) {
             std::swap(horizontal_centroids, prev_centroids);
@@ -983,7 +983,6 @@ class SuperKMeans {
             if (run_quantized_assignment && run_f32_assignment) {
                 SKM_PROFILE_SCOPE("compare_assignments");
                 size_t match = 0;
-                size_t mismatches_printed = 0;
                 for (size_t s = 0; s < n_samples; ++s) {
                     if (i8_assignments[s] == assignments[s]) {
                         ++match;