From c5ffb612a0fe7ccf999fb1a0d67ad3410044da9e Mon Sep 17 00:00:00 2001 From: Joseph Hwang Date: Sun, 22 Mar 2026 21:51:47 -0700 Subject: [PATCH 1/5] Various changes --- CMakeLists.txt | 31 ++- .../end_to_end/end_to_end_superkmeans.cpp | 3 +- .../distance_computers/batch_computers.h | 227 +++++++++++------ include/superkmeans/superkmeans.h | 229 +++++++++++++++++- 4 files changed, 402 insertions(+), 88 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e4090b8..3b666f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,20 @@ set(SKMEANS_COMPILE_BENCHMARKS OFF CACHE BOOL "Whether to compile benchmarks") set(SKMEANS_ENABLE_GPU OFF CACHE BOOL "Whether to use the GPU-based implementation of SuperKMeans") set(SKMEANS_COMPILE_EXAMPLES ON CACHE BOOL "Whether to compile examples") -find_package(OpenMP REQUIRED) + +# Apple Clang does not bundle OpenMP; point FindOpenMP at Homebrew's libomp. +if(APPLE) + execute_process(COMMAND brew --prefix libomp + OUTPUT_VARIABLE HOMEBREW_LIBOMP_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) + if(HOMEBREW_LIBOMP_PREFIX) + set(OpenMP_CXX_FLAGS "-Xclang -fopenmp -I${HOMEBREW_LIBOMP_PREFIX}/include" + CACHE STRING "" FORCE) + set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE) + set(OpenMP_omp_LIBRARY "${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib" + CACHE FILEPATH "" FORCE) + endif() +endif() +find_package(OpenMP REQUIRED COMPONENTS CXX) list(PREPEND CMAKE_PREFIX_PATH /usr/local) @@ -69,7 +82,19 @@ else() message(STATUS "GPU disabled") endif() -add_compile_definitions(CMAKE_SOURCE_DIR="${CMAKE_SOURCE_DIR}") +set(XNNPACK_BUILD_TESTS OFF CACHE BOOL "" FORCE) +set(XNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE) +set(XNNPACK_BUILD_ALL_MICROKERNELS OFF CACHE BOOL "" FORCE) +FetchContent_Declare( + xnnpack + GIT_REPOSITORY https://github.com/google/XNNPACK.git + GIT_TAG master + GIT_SHALLOW TRUE +) +FetchContent_MakeAvailable(xnnpack) +set(XNNPACK_LINK_LIBRARIES XNNPACK pthreadpool) + +add_compile_definitions(CMAKE_SOURCE_DIR="${CMAKE_SOURCE_DIR}" BENCHMARK_TIME) include_directories(include extern/Eigen) if(SKMEANS_COMPILE_TESTS) @@ -120,6 +145,8 @@ if(Python_FOUND AND pybind11_FOUND) target_link_libraries(_superkmeans PRIVATE ${BLAS_LINK_LIBRARIES}) endif() + target_link_libraries(_superkmeans PRIVATE ${XNNPACK_LINK_LIBRARIES}) + if(FFTW_FLOAT_LIB_FOUND) target_link_libraries(_superkmeans PRIVATE ${FFTW_FLOAT_LIB} ${FFTW_FLOAT_OPENMP_LIB}) endif() diff --git a/benchmarks/end_to_end/end_to_end_superkmeans.cpp b/benchmarks/end_to_end/end_to_end_superkmeans.cpp index fc0b176..9b50099 100644 --- a/benchmarks/end_to_end/end_to_end_superkmeans.cpp +++ b/benchmarks/end_to_end/end_to_end_superkmeans.cpp @@ -67,13 +67,14 @@ int main(int argc, char* argv[]) { skmeans::SuperKMeansConfig config; config.iters = n_iters; - config.verbose = false; + config.verbose = true; config.n_threads = THREADS; config.objective_k = 100; config.ann_explore_fraction = 0.01f; config.unrotate_centroids = true; config.early_termination = false; config.sampling_fraction = sampling_fraction; + // TODO: For joseph experiment config.use_blas_only = false; auto is_angular = std::find( diff --git a/include/superkmeans/distance_computers/batch_computers.h b/include/superkmeans/distance_computers/batch_computers.h index 12ba03d..2d96ab0 100644 --- a/include/superkmeans/distance_computers/batch_computers.h +++ b/include/superkmeans/distance_computers/batch_computers.h @@ -2,34 +2,17 @@ #include #include +#include +#include #include #include "superkmeans/common.h" #include "superkmeans/distance_computers/base_computers.h" +#include "superkmeans/distance_computers/matmul.h" #include "superkmeans/pdx/layout.h" #include "superkmeans/profiler.h" #include -// Eigen already declares sgemm_, so we don't need to redeclare it -// TODO(lkuffo, low): However, I would like to have more control over this -extern "C" { -/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ -// int sgemm_( -// const char* transa, -// const char* transb, -// FINTEGER* m, -// FINTEGER* n, -// FINTEGER* k, -// const float* alpha, -// const float* a, -// FINTEGER* lda, -// const float* b, -// FINTEGER* ldb, -// float* beta, -// float* c, -// FINTEGER* ldc); -} - namespace skmeans { template @@ -38,6 +21,145 @@ class BatchComputer {}; template <> class BatchComputer {}; +/** + * @brief Finds the top-1 nearest neighbor for each query vector (int8 data). + * + * Computes L2 distances between X and Y using int8 dot products via XNNPACK: + * ||x-y||² = ||x||² + ||y||² - 2*x·y + * + * @param x Query vectors in row-major layout (n_x × d), int8 + * @param y Reference vectors in row-major layout (n_y × d), int8 + * @param n_x Number of query vectors + * @param n_y Number of reference vectors + * @param d Dimensionality + * @param norms_x Pre-computed squared L2 norms of query vectors (float) + * @param norms_y Pre-computed squared L2 norms of reference vectors (float) + * @param out_knn Output: index of nearest neighbor for each query + * @param out_distances Output: distance to nearest neighbor for each query + * @param tmp_distances_buf Buffer for batch distance computation (size: X_BATCH_SIZE × Y_BATCH_SIZE) + */ +/** + * @brief Finds the top-k nearest neighbors for each query vector (int8 data). + * + * Computes L2 distances between X and Y using int8 dot products via XNNPACK: + * ||x-y||² = ||x||² + ||y||² - 2*x·y + * For k=1, uses vectorized minCoeff. For k>1, maintains a sorted top-k array + * per query with threshold-based rejection. + * + * @param x Query vectors in row-major layout (n_x × d), int8 + * @param y Reference vectors in row-major layout (n_y × d), int8 + * @param n_x Number of query vectors + * @param n_y Number of reference vectors + * @param d Dimensionality + * @param norms_x Pre-computed squared L2 norms of query vectors (float) + * @param norms_y Pre-computed squared L2 norms of reference vectors (float) + * @param k Number of nearest neighbors to find + * @param out_knn Output: indices of k nearest neighbors per query (size: n_x × k) + * @param out_distances Output: distances to k nearest neighbors per query (size: n_x × k) + * @param tmp_distances_buf Buffer for batch distance computation (size: X_BATCH_SIZE × Y_BATCH_SIZE) + */ +inline void FindKNearestNeighborsI8( + const int8_t* SKM_RESTRICT x, + const int8_t* SKM_RESTRICT y, + const size_t n_x, + const size_t n_y, + const size_t d, + const float* SKM_RESTRICT norms_x, + const float* SKM_RESTRICT norms_y, + const size_t k, + uint32_t* SKM_RESTRICT out_knn, + float* SKM_RESTRICT out_distances, + float* SKM_RESTRICT tmp_distances_buf +) { + using MatrixR = Eigen::Matrix; + matmul::InitXnnpack(); + SKM_PROFILE_SCOPE("blas_i8_knn"); + std::fill_n(out_distances, n_x * k, std::numeric_limits::max()); + std::fill_n(out_knn, n_x * k, static_cast(-1)); + + for (size_t i = 0; i < n_x; i += X_BATCH_SIZE) { + auto batch_n_x = X_BATCH_SIZE; + auto batch_x_p = x + (i * d); + if (i + X_BATCH_SIZE > n_x) { + batch_n_x = n_x - i; + } + for (size_t j = 0; j < n_y; j += Y_BATCH_SIZE) { + auto batch_n_y = Y_BATCH_SIZE; + auto batch_y_p = y + (j * d); + if (j + Y_BATCH_SIZE > n_y) { + batch_n_y = n_y - j; + } + { + SKM_PROFILE_SCOPE("blas_i8_knn/gemm"); + matmul::XnnpackMatmulI8F32( + batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, 0, tmp_distances_buf + ); + } + Eigen::Map distances_matrix(tmp_distances_buf, batch_n_x, batch_n_y); + + SKM_PROFILE_SCOPE("blas_i8_knn/top_k"); +#pragma omp parallel for num_threads(g_n_threads) + for (size_t r = 0; r < batch_n_x; ++r) { + const auto i_idx = i + r; + const float norm_x_i = norms_x[i_idx]; + float* row_p = distances_matrix.data() + r * batch_n_y; +#pragma clang loop vectorize(enable) + for (size_t c = 0; c < batch_n_y; ++c) { + row_p[c] = -2.0f * row_p[c] + norm_x_i + norms_y[j + c]; + } + + float* out_dist_row = out_distances + i_idx * k; + uint32_t* out_knn_row = out_knn + i_idx * k; + + if (k == 1) { + uint32_t min_idx; + auto min_dist = distances_matrix.row(r).minCoeff(&min_idx); + if (min_dist < out_dist_row[0]) { + out_dist_row[0] = std::max(0.0f, min_dist); + out_knn_row[0] = static_cast(j + min_idx); + } + continue; + } + + float threshold = out_dist_row[k - 1]; + for (size_t c = 0; c < batch_n_y; ++c) { + float dist = std::max(0.0f, row_p[c]); + if (dist >= threshold) continue; + + // Find where the new candidate belongs + size_t pos = 0; + for (; pos < k; ++pos) { + if (dist < out_dist_row[pos]) break; + } + // Shift worse entries down by one (evicts the k-th) + for (size_t ki = k - 1; ki > pos; --ki) { + out_dist_row[ki] = out_dist_row[ki - 1]; + out_knn_row[ki] = out_knn_row[ki - 1]; + } + out_dist_row[pos] = dist; + out_knn_row[pos] = static_cast(j + c); + threshold = out_dist_row[k - 1]; + } + } + } + } +} + +inline void FindNearestNeighborI8( + const int8_t* SKM_RESTRICT x, + const int8_t* SKM_RESTRICT y, + const size_t n_x, + const size_t n_y, + const size_t d, + const float* SKM_RESTRICT norms_x, + const float* SKM_RESTRICT norms_y, + uint32_t* SKM_RESTRICT out_knn, + float* SKM_RESTRICT out_distances, + float* SKM_RESTRICT tmp_distances_buf +) { + FindKNearestNeighborsI8(x, y, n_x, n_y, d, norms_x, norms_y, 1, out_knn, out_distances, tmp_distances_buf); +} + template <> class BatchComputer { @@ -49,59 +171,6 @@ class BatchComputer { using MatrixR = Eigen::Matrix; using MatrixC = Eigen::Matrix; - private: - /** - * @brief Performs BLAS matrix multiplication: distances = x * y^T - * - * Computes the dot product matrix between X and Y - * Can optionally use only the first partial_d dimensions for partial distance computation. - * - * @param batch_x_p Pointer to query vectors batch (batch_n_x × d) - * @param batch_y_p Pointer to reference vectors batch (batch_n_y × d) - * @param batch_n_x Number of query vectors in batch - * @param batch_n_y Number of reference vectors in batch - * @param d Full dimensionality - * @param partial_d Number of dimensions to use (if 0, uses all dimensions) - * @param tmp_distances_buf Output buffer for distance matrix (batch_n_x × batch_n_y) - */ - static void BlasMatrixMultiplication( - const data_t* SKM_RESTRICT batch_x_p, - const data_t* SKM_RESTRICT batch_y_p, - const size_t batch_n_x, - const size_t batch_n_y, - const size_t d, - const size_t partial_d, - float* SKM_RESTRICT tmp_distances_buf - ) { - const char trans_a = 'T'; - const char trans_b = 'N'; - - int m = static_cast(batch_n_y); - int n = static_cast(batch_n_x); - int k = static_cast(partial_d > 0 && partial_d < d ? partial_d : d); - float alpha = 1.0f; - float beta = 0.0f; - int lda = static_cast(d); // d of y (row stride in row-major) - int ldb = static_cast(d); // d of x (row stride in row-major) - int ldc = static_cast(batch_n_y); // Leading dimension of tmp_distances_buf - - sgemm_( - &trans_a, - &trans_b, - &m, - &n, - &k, - &alpha, - batch_y_p, - &lda, - batch_x_p, - &ldb, - &beta, - tmp_distances_buf, - &ldc - ); - } - public: /** * @brief Finds the top-1 nearest neighbor for each query vector. @@ -133,8 +202,6 @@ class BatchComputer { distance_t* SKM_RESTRICT out_distances, float* SKM_RESTRICT tmp_distances_buf ) { - SKM_PROFILE_SCOPE("search"); - SKM_PROFILE_SCOPE("search/1st_blas"); std::fill_n(out_distances, n_x, std::numeric_limits::max()); for (size_t i = 0; i < n_x; i += X_BATCH_SIZE) { auto batch_n_x = X_BATCH_SIZE; @@ -154,7 +221,7 @@ class BatchComputer { #pragma omp parallel for num_threads(g_n_threads) schedule(static) for (size_t r = 0; r < batch_n_x; r += MINI_BATCH_SIZE) { auto mini_batch_n_x = std::min(MINI_BATCH_SIZE, batch_n_x - r); - BlasMatrixMultiplication( + matmul::MatrixMultiplication( batch_x_p + r * d, batch_y_p, mini_batch_n_x, @@ -165,7 +232,7 @@ class BatchComputer { ); } #else - BlasMatrixMultiplication( + matmul::MatrixMultiplication( batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, 0, tmp_distances_buf ); #endif @@ -245,7 +312,7 @@ class BatchComputer { batch_n_y = n_y - j; } - BlasMatrixMultiplication( + matmul::MatrixMultiplication( batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, 0, tmp_distances_buf ); Eigen::Map distances_matrix(tmp_distances_buf, batch_n_x, batch_n_y); @@ -351,7 +418,7 @@ class BatchComputer { #pragma omp parallel for num_threads(g_n_threads) schedule(static) for (size_t r = 0; r < batch_n_x; r += MINI_BATCH_SIZE) { auto mini_batch_n_x = std::min(MINI_BATCH_SIZE, batch_n_x - r); - BlasMatrixMultiplication( + matmul::MatrixMultiplication( batch_x_p + r * d, batch_y_p, mini_batch_n_x, @@ -362,7 +429,7 @@ class BatchComputer { ); } #else - BlasMatrixMultiplication( + matmul::MatrixMultiplication( batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, partial_d, tmp_distances_buf ); #endif diff --git a/include/superkmeans/superkmeans.h b/include/superkmeans/superkmeans.h index f0cc253..b0e7faf 100644 --- a/include/superkmeans/superkmeans.h +++ b/include/superkmeans/superkmeans.h @@ -661,6 +661,104 @@ class SuperKMeans { } } + /** + * @brief Performs assignment using quantized (int8) distance computation. + * + * Quantizes data (once) and centroids, finds top-k candidates via i8 GEMM, + * then reranks with f32 distances when k > 1. Writes final assignments to + * assignments/distances members and zeros accumulators for UpdateCentroids. + * + * @param data Data matrix (row-major, n_samples × d), used for reranking + * @param centroids f32 centroids (row-major, n_clusters × d), used for quantization and reranking + * @param n_samples Number of vectors in the data + * @param n_clusters Number of centroids + */ + void QuantizedAssignAndUpdateCentroids( + const vector_value_t* SKM_RESTRICT data, + const vector_value_t* SKM_RESTRICT centroids, + const size_t n_samples, + const size_t n_clusters, + uint32_t* SKM_RESTRICT out_assignments, + distance_t* SKM_RESTRICT out_distances + ) { + if (quantized_data.empty()) { + SKM_PROFILE_SCOPE("quantize_data"); + quantized_data.resize(n_samples * d); + quantized_data_norms.reset(new vector_value_t[n_samples]); + QuantizeEmbeddings(data, n_samples, d, quantized_data.data()); + GetQuantizedL2NormsRowMajor(quantized_data.data(), n_samples, quantized_data_norms.get()); + } + + std::vector quantized_centroids_buf; + std::unique_ptr quantized_centroid_norms; + { + SKM_PROFILE_SCOPE("quantize_centroids"); + quantized_centroids_buf.resize(n_clusters * d); + quantized_centroid_norms.reset(new vector_value_t[n_clusters]); + QuantizeEmbeddings(centroids, n_clusters, d, quantized_centroids_buf.data()); + GetQuantizedL2NormsRowMajor(quantized_centroids_buf.data(), n_clusters, quantized_centroid_norms.get()); + } + + const size_t i8_k = 2; + std::vector i8_knn(n_samples * i8_k); + std::vector i8_knn_distances(n_samples * i8_k); + std::vector i8_tmp_distances_buf(X_BATCH_SIZE * Y_BATCH_SIZE); + { + SKM_PROFILE_SCOPE("assign_i8/knn"); + FindKNearestNeighborsI8( + quantized_data.data(), + quantized_centroids_buf.data(), + n_samples, + n_clusters, + d, + quantized_data_norms.get(), + quantized_centroid_norms.get(), + i8_k, + i8_knn.data(), + i8_knn_distances.data(), + i8_tmp_distances_buf.data() + ); + } + + if (i8_k > 1) { + SKM_PROFILE_SCOPE("assign_i8/rerank"); +#pragma omp parallel for num_threads(g_n_threads) + for (size_t s = 0; s < n_samples; ++s) { + const float* __restrict__ point = data + s * d; + float best_dist = std::numeric_limits::max(); + uint32_t best_idx = 0; + for (size_t ki = 0; ki < i8_k; ++ki) { + uint32_t c_idx = i8_knn[s * i8_k + ki]; + if (c_idx == static_cast(-1)) break; + const float* __restrict__ cent = centroids + c_idx * d; + float dist = 0; +#pragma omp simd reduction(+:dist) + for (size_t dim = 0; dim < d; ++dim) { + float diff = point[dim] - cent[dim]; + dist += diff * diff; + } + if (dist < best_dist) { + best_dist = dist; + best_idx = c_idx; + } + } + out_assignments[s] = best_idx; + out_distances[s] = best_dist; + } + } else { + for (size_t s = 0; s < n_samples; ++s) { + out_assignments[s] = i8_knn[s]; + out_distances[s] = i8_knn_distances[s]; + } + } + + { + SKM_PROFILE_SCOPE("fill"); + std::fill(horizontal_centroids.get(), horizontal_centroids.get() + (n_clusters * d), 0.0); + std::fill(cluster_sizes.get(), cluster_sizes.get() + n_clusters, 0); + } + } + /** * @brief Performs assignment and centroid update using GEMM+PRUNING. * @@ -756,6 +854,67 @@ class SuperKMeans { } } + void GetQuantizedL2NormsRowMajor( + const int8_t* SKM_RESTRICT data, + const size_t n, + vector_value_t* SKM_RESTRICT out_norm + ) { + SKM_PROFILE_SCOPE("norms_calc"); + for (size_t i = 0; i < n; ++i) { + const int8_t* row = data + i * d; + int32_t acc = 0; + #pragma clang loop vectorize(enable) + for (size_t j = 0; j < d; ++j) { + acc += static_cast(row[j]) * static_cast(row[j]); + } + out_norm[i] = static_cast(acc); + } + } + + static void QuantizeEmbedding( + const float* embedding, + const size_t num_dimensions, + const float quantization_base, + const float quantization_scale, + int8_t* output_quantized_embedding + ) { +#pragma clang loop vectorize(enable) + for (size_t i = 0; i < num_dimensions; ++i) { + float scaled = (embedding[i] - quantization_base) * quantization_scale; + uint8_t u = static_cast(std::min(nearbyintf(scaled), 255.0f)); + output_quantized_embedding[i] = static_cast(u - 128); + } + } + + static void QuantizeEmbeddings( + const float* embeddings, + const size_t total_elements, + const size_t num_dimensions, + int8_t* output_quantized_embeddings + ) { + const size_t total_values = total_elements * num_dimensions; + float global_min = std::numeric_limits::max(); + float global_max = std::numeric_limits::lowest(); +#pragma omp parallel for num_threads(g_n_threads) reduction(min:global_min) reduction(max:global_max) + for (size_t i = 0; i < total_values; ++i) { + global_min = std::min(global_min, embeddings[i]); + global_max = std::max(global_max, embeddings[i]); + } + const float range = global_max - global_min; + const float scaling_factor = (range > 0) ? 255.0f / range : 1.0f; + +#pragma omp parallel for num_threads(g_n_threads) schedule(static) + for (size_t i = 0; i < total_elements; ++i) { + QuantizeEmbedding( + &embeddings[i * num_dimensions], + num_dimensions, + global_min, + scaling_factor, + &output_quantized_embeddings[i * num_dimensions] + ); + } + } + /** * @brief Runs a single K-Means iteration with either GEMM-only or GEMM+PRUNING strategy. * @@ -788,17 +947,73 @@ class SuperKMeans { const size_t n_clusters, size_t& iter_idx, const bool is_first_iter, - std::vector& target_stats + std::vector& target_stats, + const bool run_quantized_assignment = false, + const bool run_f32_assignment = true, + const bool use_quantized_assignment_for_update = false ) { + std::cout << "Training centroids iteration=" << iter_idx << std::endl; + if (!is_first_iter) { std::swap(horizontal_centroids, prev_centroids); } if constexpr (GEMM_ONLY) { - GetL2NormsRowMajor(prev_centroids.get(), n_clusters, centroid_norms.get()); - FirstAssignAndUpdateCentroids( - data_to_cluster, prev_centroids.get(), tmp_distances_buf, n_samples, n_clusters - ); + std::cout << "Running GEMM loop" << std::endl; + + // f32 assignment path + if (run_f32_assignment) { + SKM_PROFILE_SCOPE("assign_f32"); + GetL2NormsRowMajor(prev_centroids.get(), n_clusters, centroid_norms.get()); + FirstAssignAndUpdateCentroids( + data_to_cluster, prev_centroids.get(), tmp_distances_buf, n_samples, n_clusters + ); + } + + // Quantized (i8) assignment path + std::vector i8_assignments; + std::vector i8_distances; + if (run_quantized_assignment) { + i8_assignments.resize(n_samples); + i8_distances.assign(n_samples, std::numeric_limits::max()); + SKM_PROFILE_SCOPE("assign_i8"); + QuantizedAssignAndUpdateCentroids( + data_to_cluster, prev_centroids.get(), n_samples, n_clusters, + i8_assignments.data(), i8_distances.data() + ); + } + + // Compare i8 vs f32 assignments when both were computed + if (run_quantized_assignment && run_f32_assignment) { + SKM_PROFILE_SCOPE("compare_assignments"); + size_t match = 0; + size_t mismatches_printed = 0; + for (size_t s = 0; s < n_samples; ++s) { + if (i8_assignments[s] == assignments[s]) { + ++match; + continue; + } + if (mismatches_printed < 10) { + std::cout << " mismatch[" << mismatches_printed << "] sample=" << s + << " i8_assign=" << i8_assignments[s] + << " f32_assign=" << assignments[s] + << " i8_dist=" << i8_distances[s] + << " f32_dist=" << distances[s] + << " delta=" << (i8_distances[s] - distances[s]) << std::endl; + ++mismatches_printed; + } + } + std::cout << "i8 vs f32 assignment agreement: " << match << " / " << n_samples + << " (" << (100.0 * match / n_samples) << "%)" << std::endl; + } + + // Choose which assignments drive centroid updates + if (use_quantized_assignment_for_update) { + assert(!i8_assignments.empty() && i8_assignments.size() == n_samples); + std::copy(i8_assignments.begin(), i8_assignments.end(), assignments.get()); + std::copy(i8_distances.begin(), i8_distances.end(), distances.get()); + } + } else { GetPartialL2NormsRowMajor( prev_centroids.get(), n_clusters, centroids_partial_norms.data(), partial_d @@ -1471,6 +1686,10 @@ class SuperKMeans { std::unique_ptr centroid_norms; std::unique_ptr sampled_indices; + // Quantized data (computed once, reused across iterations) + std::vector quantized_data; + std::unique_ptr quantized_data_norms; + // Buffers for ground truth and recall computation std::unique_ptr gt_assignments; std::unique_ptr gt_distances; From 6448e3c46a803846a90e528e79eb8592f65dfd8c Mon Sep 17 00:00:00 2001 From: Joseph Hwang Date: Sun, 22 Mar 2026 22:40:30 -0700 Subject: [PATCH 2/5] Matmul --- .../superkmeans/distance_computers/matmul.h | 314 ++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 include/superkmeans/distance_computers/matmul.h diff --git a/include/superkmeans/distance_computers/matmul.h b/include/superkmeans/distance_computers/matmul.h new file mode 100644 index 0000000..4aa0bdc --- /dev/null +++ b/include/superkmeans/distance_computers/matmul.h @@ -0,0 +1,314 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "superkmeans/common.h" +#include "superkmeans/profiler.h" + +#include +#include +#include + +#if defined(__aarch64__) +#include +#endif + +// Eigen already declares sgemm_, so we don't need to redeclare it +extern "C" {} + +namespace skmeans { +namespace matmul { + +inline bool UseXnnpack() { + static const bool enabled = (std::getenv("USE_XNNPACK") != nullptr); + return enabled; +} + +inline bool InitXnnpack() { + static const bool ok = (xnn_initialize(nullptr) == xnn_status_success); + return ok; +} + +/** + * @brief Packs kernel rows into a contiguous partial-dimension buffer for XNNPACK. + * + * When partial_d < d, XNNPACK requires the kernel to be contiguous N × partial_d. + * This copies the first partial_d elements from each of the n_rows rows (stride d) + * into a thread-local contiguous buffer and returns a pointer to it. + * When partial_d == d, returns the original pointer with no copy. + * + * @return Pointer to contiguous kernel data and the effective dimension count. + */ +template +inline std::pair PackKernelForPartialD( + const T* kernel, size_t n_rows, size_t d, size_t partial_d +) { + const size_t partial_d_ = (partial_d > 0 && partial_d < d) ? partial_d : d; + if (partial_d_ == d) { + return {kernel, d}; + } + thread_local std::vector buf; + buf.resize(n_rows * partial_d_ * sizeof(T)); + auto* dst = reinterpret_cast(buf.data()); + for (size_t i = 0; i < n_rows; ++i) { + std::memcpy(dst + i * partial_d_, kernel + i * d, partial_d_ * sizeof(T)); + } + return {dst, partial_d_}; +} + +/** + * @brief Performs matrix multiplication using XNNPACK's fully-connected operator: C = X * Y^T + * + * Uses XNNPACK's NC F32 fully-connected operator where X is the "input" (batch_n_x × d) + * and Y is the "kernel" (batch_n_y × d). Supports partial-dimension computation via partial_d: + * the input (X) uses XNNPACK's input_stride to skip trailing dimensions without copying, + * while the kernel (Y) rows are packed into a contiguous buffer when partial_d < d. + * + * Thread-local pthreadpool with g_n_threads threads is used. Not safe to call from OMP parallel regions. + */ +inline void XnnpackMatmulF32( + const float* SKM_RESTRICT batch_x_p, + const float* SKM_RESTRICT batch_y_p, + const size_t batch_n_x, + const size_t batch_n_y, + const size_t d, + const size_t partial_d, + float* SKM_RESTRICT tmp_distances_buf +) { + SKM_PROFILE_SCOPE("xnnpack_f32"); + const float* kernel_p; + size_t partial_d_; + { + SKM_PROFILE_SCOPE("xnnpack_f32/pack"); + std::tie(kernel_p, partial_d_) = PackKernelForPartialD(batch_y_p, batch_n_y, d, partial_d); + } + + { + SKM_PROFILE_SCOPE("xnnpack_f32/matmul"); + thread_local pthreadpool_t tp = pthreadpool_create(omp_get_max_threads()); + xnn_operator_t op = nullptr; + xnn_create_fully_connected_nc_f32( + partial_d_, batch_n_y, d, batch_n_y, + kernel_p, nullptr, + -std::numeric_limits::infinity(), + std::numeric_limits::infinity(), + 0, nullptr, &op + ); + xnn_reshape_fully_connected_nc_f32(op, batch_n_x, tp); + xnn_setup_fully_connected_nc_f32(op, batch_x_p, tmp_distances_buf); + xnn_run_operator(op, tp); + xnn_delete_operator(op); + } +} + +/** + * @brief Performs int8→float32 matrix multiplication using XNNPACK: C = X * Y^T + * + * Uses XNNPACK's dynamic-quantized int8 fully-connected operator (qd8_f32_qc8w). + * Identity quantization (scale=1.0, zero_point=0) so the result is the integer dot product + * cast to float. Supports partial-dimension computation via partial_d. + * + * Thread-local pthreadpool with g_n_threads threads is used. Not safe to call from OMP parallel regions. + */ +inline void XnnpackMatmulI8F32( + const int8_t* SKM_RESTRICT batch_x_p, + const int8_t* SKM_RESTRICT batch_y_p, + const size_t batch_n_x, + const size_t batch_n_y, + const size_t d, + const size_t partial_d, + float* SKM_RESTRICT tmp_distances_buf +) { + SKM_PROFILE_SCOPE("xnnpack_i8f32"); + const int8_t* kernel_p; + size_t partial_d_; + { + SKM_PROFILE_SCOPE("xnnpack_i8f32/pack"); + std::tie(kernel_p, partial_d_) = PackKernelForPartialD(batch_y_p, batch_n_y, d, partial_d); + } + + { + SKM_PROFILE_SCOPE("xnnpack_i8f32/matmul"); + thread_local std::vector kernel_scale; + kernel_scale.assign(batch_n_y, 1.0f); + + thread_local std::vector qparams; + qparams.assign(batch_n_x, {0, 1.0f}); + + thread_local pthreadpool_t tp_i8 = pthreadpool_create(omp_get_max_threads()); + xnn_operator_t op = nullptr; + xnn_create_fully_connected_nc_qd8_f32_qc8w( + partial_d_, batch_n_y, d, batch_n_y, + kernel_scale.data(), kernel_p, nullptr, + -std::numeric_limits::infinity(), + std::numeric_limits::infinity(), + 0, nullptr, &op + ); + size_t ws_size = 0; + xnn_reshape_fully_connected_nc_qd8_f32_qc8w(op, batch_n_x, &ws_size, tp_i8); + thread_local std::vector ws; + ws.resize(ws_size); + xnn_setup_fully_connected_nc_qd8_f32_qc8w( + op, batch_x_p, tmp_distances_buf, ws.data(), qparams.data() + ); + xnn_run_operator(op, tp_i8); + xnn_delete_operator(op); + } +} + +#if defined(__aarch64__) +/** + * @brief Performs int8 matrix multiplication using NEON SDOT: C = X * Y^T + * + * Hand-written kernel using vdotq_s32 (SDOT) instructions for int8 dot products. + * Both X and Y are row-major with stride d; only the first partial_d dimensions are used. + * Output is int32 accumulation of the dot products. + */ +inline void NeonSdotMatmulI8( + const int8_t* SKM_RESTRICT batch_x_p, + const int8_t* SKM_RESTRICT batch_y_p, + const size_t batch_n_x, + const size_t batch_n_y, + const size_t d, + const size_t partial_d, + int32_t* SKM_RESTRICT tmp_distances_buf +) { + const size_t partial_d_ = (partial_d > 0 && partial_d < d) ? partial_d : d; + const size_t K16 = partial_d_ & ~(size_t)15; + +#pragma omp parallel for schedule(static) num_threads(g_n_threads) + for (size_t m = 0; m < batch_n_x; m++) { + const int8_t* a_row = batch_x_p + m * d; + int32_t* c_row = tmp_distances_buf + m * batch_n_y; + + size_t n = 0; + for (; n + 4 <= batch_n_y; n += 4) { + const int8_t* b0 = batch_y_p + (n + 0) * d; + const int8_t* b1 = batch_y_p + (n + 1) * d; + const int8_t* b2 = batch_y_p + (n + 2) * d; + const int8_t* b3 = batch_y_p + (n + 3) * d; + + int32x4_t acc0 = vdupq_n_s32(0); + int32x4_t acc1 = vdupq_n_s32(0); + int32x4_t acc2 = vdupq_n_s32(0); + int32x4_t acc3 = vdupq_n_s32(0); + + for (size_t k = 0; k < K16; k += 16) { + int8x16_t va = vld1q_s8(a_row + k); + acc0 = vdotq_s32(acc0, va, vld1q_s8(b0 + k)); + acc1 = vdotq_s32(acc1, va, vld1q_s8(b1 + k)); + acc2 = vdotq_s32(acc2, va, vld1q_s8(b2 + k)); + acc3 = vdotq_s32(acc3, va, vld1q_s8(b3 + k)); + } + + int32_t s0 = vaddvq_s32(acc0), s1 = vaddvq_s32(acc1); + int32_t s2 = vaddvq_s32(acc2), s3 = vaddvq_s32(acc3); + + for (size_t k = K16; k < partial_d_; k++) { + int32_t a_val = a_row[k]; + s0 += a_val * static_cast(b0[k]); + s1 += a_val * static_cast(b1[k]); + s2 += a_val * static_cast(b2[k]); + s3 += a_val * static_cast(b3[k]); + } + + c_row[n + 0] = s0; + c_row[n + 1] = s1; + c_row[n + 2] = s2; + c_row[n + 3] = s3; + } + + for (; n < batch_n_y; n++) { + const int8_t* b_row = batch_y_p + n * d; + int32x4_t acc = vdupq_n_s32(0); + size_t k = 0; + for (; k < K16; k += 16) + acc = vdotq_s32(acc, vld1q_s8(a_row + k), vld1q_s8(b_row + k)); + int32_t sum = vaddvq_s32(acc); + for (; k < partial_d_; k++) + sum += static_cast(a_row[k]) * static_cast(b_row[k]); + c_row[n] = sum; + } + } +} +#endif + +/** + * @brief Performs matrix multiplication via BLAS sgemm: C = X * Y^T + * + * Pure sgemm call with partial-dimension support via the k parameter. + * Both X (batch_n_x × d) and Y (batch_n_y × d) are row-major; sgemm handles + * the transpose by using the leading dimension as stride. + */ +inline void BlasMatrixMultiplication( + const float* SKM_RESTRICT batch_x_p, + const float* SKM_RESTRICT batch_y_p, + const size_t batch_n_x, + const size_t batch_n_y, + const size_t d, + const size_t partial_d, + float* SKM_RESTRICT tmp_distances_buf +) { + const char trans_a = 'T'; + const char trans_b = 'N'; + + int m = static_cast(batch_n_y); + int n = static_cast(batch_n_x); + int k = static_cast(partial_d > 0 && partial_d < d ? partial_d : d); + float alpha = 1.0f; + float beta = 0.0f; + int lda = static_cast(d); + int ldb = static_cast(d); + int ldc = static_cast(batch_n_y); + + sgemm_( + &trans_a, + &trans_b, + &m, + &n, + &k, + &alpha, + batch_y_p, + &lda, + batch_x_p, + &ldb, + &beta, + tmp_distances_buf, + &ldc + ); +} + +/** + * @brief Generic matrix multiplication dispatcher: C = X * Y^T + * + * Routes to XNNPACK (if USE_XNNPACK env var is set) or falls back to BLAS sgemm. + * Supports partial-dimension computation via partial_d. + */ +inline void MatrixMultiplication( + const float* SKM_RESTRICT batch_x_p, + const float* SKM_RESTRICT batch_y_p, + const size_t batch_n_x, + const size_t batch_n_y, + const size_t d, + const size_t partial_d, + float* SKM_RESTRICT tmp_distances_buf +) { + if (UseXnnpack() && InitXnnpack()) { + XnnpackMatmulF32( + batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, partial_d, tmp_distances_buf + ); + return; + } + BlasMatrixMultiplication( + batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, partial_d, tmp_distances_buf + ); +} + +} // namespace matmul +} // namespace skmeans From f00dd076daae88dd4040aaeeff3fcd9b43177de7 Mon Sep 17 00:00:00 2001 From: Joseph Hwang Date: Sun, 22 Mar 2026 23:14:48 -0700 Subject: [PATCH 3/5] XNNpack --- .../end_to_end/end_to_end_superkmeans.cpp | 6 +- .../superkmeans/distance_computers/matmul.h | 148 +++++++----------- include/superkmeans/superkmeans.h | 17 +- 3 files changed, 62 insertions(+), 109 deletions(-) diff --git a/benchmarks/end_to_end/end_to_end_superkmeans.cpp b/benchmarks/end_to_end/end_to_end_superkmeans.cpp index 9b50099..59361de 100644 --- a/benchmarks/end_to_end/end_to_end_superkmeans.cpp +++ b/benchmarks/end_to_end/end_to_end_superkmeans.cpp @@ -67,15 +67,15 @@ int main(int argc, char* argv[]) { skmeans::SuperKMeansConfig config; config.iters = n_iters; - config.verbose = true; + config.verbose = false; config.n_threads = THREADS; config.objective_k = 100; config.ann_explore_fraction = 0.01f; config.unrotate_centroids = true; config.early_termination = false; config.sampling_fraction = sampling_fraction; - // TODO: For joseph experiment - config.use_blas_only = false; + // Used for this set of benchmarks + config.use_blas_only = true; auto is_angular = std::find( bench_utils::ANGULAR_DATASETS.begin(), bench_utils::ANGULAR_DATASETS.end(), dataset diff --git a/include/superkmeans/distance_computers/matmul.h b/include/superkmeans/distance_computers/matmul.h index 4aa0bdc..742acdd 100644 --- a/include/superkmeans/distance_computers/matmul.h +++ b/include/superkmeans/distance_computers/matmul.h @@ -36,14 +36,13 @@ inline bool InitXnnpack() { } /** - * @brief Packs kernel rows into a contiguous partial-dimension buffer for XNNPACK. + * XNNPack does not support performing matrix multiplication on an arbitrary subset of dimensions + * Based on our benchmarks - we believe it is sufficiently performant to just re-copy the subset of dimensions that we care about * * When partial_d < d, XNNPACK requires the kernel to be contiguous N × partial_d. * This copies the first partial_d elements from each of the n_rows rows (stride d) * into a thread-local contiguous buffer and returns a pointer to it. - * When partial_d == d, returns the original pointer with no copy. - * - * @return Pointer to contiguous kernel data and the effective dimension count. + * If partial_d == d, returns the original pointer with no copy. */ template inline std::pair PackKernelForPartialD( @@ -53,6 +52,7 @@ inline std::pair PackKernelForPartialD( if (partial_d_ == d) { return {kernel, d}; } + // Note: this file uses threadlocals to avoid repeated allocations thread_local std::vector buf; buf.resize(n_rows * partial_d_ * sizeof(T)); auto* dst = reinterpret_cast(buf.data()); @@ -63,14 +63,10 @@ inline std::pair PackKernelForPartialD( } /** - * @brief Performs matrix multiplication using XNNPACK's fully-connected operator: C = X * Y^T - * * Uses XNNPACK's NC F32 fully-connected operator where X is the "input" (batch_n_x × d) * and Y is the "kernel" (batch_n_y × d). Supports partial-dimension computation via partial_d: * the input (X) uses XNNPACK's input_stride to skip trailing dimensions without copying, * while the kernel (Y) rows are packed into a contiguous buffer when partial_d < d. - * - * Thread-local pthreadpool with g_n_threads threads is used. Not safe to call from OMP parallel regions. */ inline void XnnpackMatmulF32( const float* SKM_RESTRICT batch_x_p, @@ -86,6 +82,8 @@ inline void XnnpackMatmulF32( size_t partial_d_; { SKM_PROFILE_SCOPE("xnnpack_f32/pack"); + // Only Y needs repacking: XNNPACK supports strided access for X (via input_stride), + // but requires Y to be contiguous N × partial_d. std::tie(kernel_p, partial_d_) = PackKernelForPartialD(batch_y_p, batch_n_y, d, partial_d); } @@ -93,16 +91,37 @@ inline void XnnpackMatmulF32( SKM_PROFILE_SCOPE("xnnpack_f32/matmul"); thread_local pthreadpool_t tp = pthreadpool_create(omp_get_max_threads()); xnn_operator_t op = nullptr; + // Create the fully-connected operator. XNNPACK's API is designed for neural network layers, + // so we have to pass several parameters (bias, clamping, flags) that we don't use. xnn_create_fully_connected_nc_f32( - partial_d_, batch_n_y, d, batch_n_y, - kernel_p, nullptr, + // input_channels: number of dimensions to dot-product over + partial_d_, + // output_channels: number of rows in Y (columns in the output matrix) + batch_n_y, + // input_stride: full row stride in X, even if we only use partial_d_ dims + d, + // output_stride + batch_n_y, + // the Y matrix + kernel_p, + // Do not set a "bias" since we want raw dot products. The "bias" is a vector that is added to the output. + nullptr, + // no output clamping (we should be able to use the full range of f32 values) -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0, nullptr, &op + // Don't set any special flags + 0, + // Do not set a cache; we recreate the operator each call + nullptr, + &op ); + // Allocate internal buffers for batch_n_x input rows xnn_reshape_fully_connected_nc_f32(op, batch_n_x, tp); + // Bind X as input and tmp_distances_buf as output xnn_setup_fully_connected_nc_f32(op, batch_x_p, tmp_distances_buf); + // Execute the matmul using the thread pool xnn_run_operator(op, tp); + // Free the operator (the thread pool is reused across calls) xnn_delete_operator(op); } } @@ -130,6 +149,8 @@ inline void XnnpackMatmulI8F32( size_t partial_d_; { SKM_PROFILE_SCOPE("xnnpack_i8f32/pack"); + // Only Y needs repacking: XNNPACK supports strided access for X (via input_stride), + // but requires Y to be contiguous N × partial_d. std::tie(kernel_p, partial_d_) = PackKernelForPartialD(batch_y_p, batch_n_y, d, partial_d); } @@ -144,101 +165,45 @@ inline void XnnpackMatmulI8F32( thread_local pthreadpool_t tp_i8 = pthreadpool_create(omp_get_max_threads()); xnn_operator_t op = nullptr; xnn_create_fully_connected_nc_qd8_f32_qc8w( - partial_d_, batch_n_y, d, batch_n_y, - kernel_scale.data(), kernel_p, nullptr, + // input_channels: number of dimensions to dot-product over + partial_d_, + // output_channels: number of rows in Y (columns in the output matrix) + batch_n_y, + // input_stride: full row stride in X, even if we only use partial_d_ dims + d, + // output_stride + batch_n_y, + // per-row scale for Y; set to 1.0 for identity quantization + kernel_scale.data(), + // the Y matrix (XNNPACK calls this "kernel") + kernel_p, + // Do not set a "bias" since we want raw dot products. The "bias" is a vector that is added to the output. + nullptr, + // no output clamping (we should be able to use the full range of f32 values) -std::numeric_limits::infinity(), std::numeric_limits::infinity(), - 0, nullptr, &op + // Don't set any special flags + 0, + // Do not set a cache; we recreate the operator each call + nullptr, + &op ); + // Allocate internal buffers for batch_n_x input rows; also reports workspace size needed size_t ws_size = 0; xnn_reshape_fully_connected_nc_qd8_f32_qc8w(op, batch_n_x, &ws_size, tp_i8); thread_local std::vector ws; ws.resize(ws_size); + // Bind X as input, tmp_distances_buf as output, plus workspace and quantization params xnn_setup_fully_connected_nc_qd8_f32_qc8w( op, batch_x_p, tmp_distances_buf, ws.data(), qparams.data() ); + // Execute the matmul using the thread pool xnn_run_operator(op, tp_i8); + // Free the operator (the thread pool is reused across calls) xnn_delete_operator(op); } } -#if defined(__aarch64__) -/** - * @brief Performs int8 matrix multiplication using NEON SDOT: C = X * Y^T - * - * Hand-written kernel using vdotq_s32 (SDOT) instructions for int8 dot products. - * Both X and Y are row-major with stride d; only the first partial_d dimensions are used. - * Output is int32 accumulation of the dot products. - */ -inline void NeonSdotMatmulI8( - const int8_t* SKM_RESTRICT batch_x_p, - const int8_t* SKM_RESTRICT batch_y_p, - const size_t batch_n_x, - const size_t batch_n_y, - const size_t d, - const size_t partial_d, - int32_t* SKM_RESTRICT tmp_distances_buf -) { - const size_t partial_d_ = (partial_d > 0 && partial_d < d) ? partial_d : d; - const size_t K16 = partial_d_ & ~(size_t)15; - -#pragma omp parallel for schedule(static) num_threads(g_n_threads) - for (size_t m = 0; m < batch_n_x; m++) { - const int8_t* a_row = batch_x_p + m * d; - int32_t* c_row = tmp_distances_buf + m * batch_n_y; - - size_t n = 0; - for (; n + 4 <= batch_n_y; n += 4) { - const int8_t* b0 = batch_y_p + (n + 0) * d; - const int8_t* b1 = batch_y_p + (n + 1) * d; - const int8_t* b2 = batch_y_p + (n + 2) * d; - const int8_t* b3 = batch_y_p + (n + 3) * d; - - int32x4_t acc0 = vdupq_n_s32(0); - int32x4_t acc1 = vdupq_n_s32(0); - int32x4_t acc2 = vdupq_n_s32(0); - int32x4_t acc3 = vdupq_n_s32(0); - - for (size_t k = 0; k < K16; k += 16) { - int8x16_t va = vld1q_s8(a_row + k); - acc0 = vdotq_s32(acc0, va, vld1q_s8(b0 + k)); - acc1 = vdotq_s32(acc1, va, vld1q_s8(b1 + k)); - acc2 = vdotq_s32(acc2, va, vld1q_s8(b2 + k)); - acc3 = vdotq_s32(acc3, va, vld1q_s8(b3 + k)); - } - - int32_t s0 = vaddvq_s32(acc0), s1 = vaddvq_s32(acc1); - int32_t s2 = vaddvq_s32(acc2), s3 = vaddvq_s32(acc3); - - for (size_t k = K16; k < partial_d_; k++) { - int32_t a_val = a_row[k]; - s0 += a_val * static_cast(b0[k]); - s1 += a_val * static_cast(b1[k]); - s2 += a_val * static_cast(b2[k]); - s3 += a_val * static_cast(b3[k]); - } - - c_row[n + 0] = s0; - c_row[n + 1] = s1; - c_row[n + 2] = s2; - c_row[n + 3] = s3; - } - - for (; n < batch_n_y; n++) { - const int8_t* b_row = batch_y_p + n * d; - int32x4_t acc = vdupq_n_s32(0); - size_t k = 0; - for (; k < K16; k += 16) - acc = vdotq_s32(acc, vld1q_s8(a_row + k), vld1q_s8(b_row + k)); - int32_t sum = vaddvq_s32(acc); - for (; k < partial_d_; k++) - sum += static_cast(a_row[k]) * static_cast(b_row[k]); - c_row[n] = sum; - } - } -} -#endif - /** * @brief Performs matrix multiplication via BLAS sgemm: C = X * Y^T * @@ -299,6 +264,7 @@ inline void MatrixMultiplication( const size_t partial_d, float* SKM_RESTRICT tmp_distances_buf ) { + // While we do support XnnpackMatmulF32 - we don't use it in any e2e benchmarks as of now. if (UseXnnpack() && InitXnnpack()) { XnnpackMatmulF32( batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, partial_d, tmp_distances_buf diff --git a/include/superkmeans/superkmeans.h b/include/superkmeans/superkmeans.h index b0e7faf..9e9d3a5 100644 --- a/include/superkmeans/superkmeans.h +++ b/include/superkmeans/superkmeans.h @@ -948,19 +948,15 @@ class SuperKMeans { size_t& iter_idx, const bool is_first_iter, std::vector& target_stats, - const bool run_quantized_assignment = false, + const bool run_quantized_assignment = true, const bool run_f32_assignment = true, - const bool use_quantized_assignment_for_update = false + const bool use_quantized_assignment_for_update = true ) { - std::cout << "Training centroids iteration=" << iter_idx << std::endl; - if (!is_first_iter) { std::swap(horizontal_centroids, prev_centroids); } if constexpr (GEMM_ONLY) { - std::cout << "Running GEMM loop" << std::endl; - // f32 assignment path if (run_f32_assignment) { SKM_PROFILE_SCOPE("assign_f32"); @@ -993,15 +989,6 @@ class SuperKMeans { ++match; continue; } - if (mismatches_printed < 10) { - std::cout << " mismatch[" << mismatches_printed << "] sample=" << s - << " i8_assign=" << i8_assignments[s] - << " f32_assign=" << assignments[s] - << " i8_dist=" << i8_distances[s] - << " f32_dist=" << distances[s] - << " delta=" << (i8_distances[s] - distances[s]) << std::endl; - ++mismatches_printed; - } } std::cout << "i8 vs f32 assignment agreement: " << match << " / " << n_samples << " (" << (100.0 * match / n_samples) << "%)" << std::endl; From 4a5cb30385d243ef627383a619616403a053c636 Mon Sep 17 00:00:00 2001 From: Joseph Hwang Date: Sun, 22 Mar 2026 23:25:40 -0700 Subject: [PATCH 4/5] Other changes --- .../distance_computers/batch_computers.h | 13 +++++-------- include/superkmeans/distance_computers/matmul.h | 9 ++------- include/superkmeans/superkmeans.h | 4 ++-- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/include/superkmeans/distance_computers/batch_computers.h b/include/superkmeans/distance_computers/batch_computers.h index 2d96ab0..160a76a 100644 --- a/include/superkmeans/distance_computers/batch_computers.h +++ b/include/superkmeans/distance_computers/batch_computers.h @@ -73,7 +73,7 @@ inline void FindKNearestNeighborsI8( ) { using MatrixR = Eigen::Matrix; matmul::InitXnnpack(); - SKM_PROFILE_SCOPE("blas_i8_knn"); + SKM_PROFILE_SCOPE("i8_knn"); std::fill_n(out_distances, n_x * k, std::numeric_limits::max()); std::fill_n(out_knn, n_x * k, static_cast(-1)); @@ -89,15 +89,12 @@ inline void FindKNearestNeighborsI8( if (j + Y_BATCH_SIZE > n_y) { batch_n_y = n_y - j; } - { - SKM_PROFILE_SCOPE("blas_i8_knn/gemm"); - matmul::XnnpackMatmulI8F32( - batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, 0, tmp_distances_buf - ); - } + matmul::XnnpackMatmulI8F32( + batch_x_p, batch_y_p, batch_n_x, batch_n_y, d, 0, tmp_distances_buf + ); Eigen::Map distances_matrix(tmp_distances_buf, batch_n_x, batch_n_y); - SKM_PROFILE_SCOPE("blas_i8_knn/top_k"); + SKM_PROFILE_SCOPE("i8_knn/top_k"); #pragma omp parallel for num_threads(g_n_threads) for (size_t r = 0; r < batch_n_x; ++r) { const auto i_idx = i + r; diff --git a/include/superkmeans/distance_computers/matmul.h b/include/superkmeans/distance_computers/matmul.h index 742acdd..fd6f623 100644 --- a/include/superkmeans/distance_computers/matmul.h +++ b/include/superkmeans/distance_computers/matmul.h @@ -15,10 +15,6 @@ #include #include -#if defined(__aarch64__) -#include -#endif - // Eigen already declares sgemm_, so we don't need to redeclare it extern "C" {} @@ -144,18 +140,17 @@ inline void XnnpackMatmulI8F32( const size_t partial_d, float* SKM_RESTRICT tmp_distances_buf ) { - SKM_PROFILE_SCOPE("xnnpack_i8f32"); const int8_t* kernel_p; size_t partial_d_; { - SKM_PROFILE_SCOPE("xnnpack_i8f32/pack"); + SKM_PROFILE_SCOPE("i8_knn/pack"); // Only Y needs repacking: XNNPACK supports strided access for X (via input_stride), // but requires Y to be contiguous N × partial_d. std::tie(kernel_p, partial_d_) = PackKernelForPartialD(batch_y_p, batch_n_y, d, partial_d); } { - SKM_PROFILE_SCOPE("xnnpack_i8f32/matmul"); + SKM_PROFILE_SCOPE("i8_knn/matmul"); thread_local std::vector kernel_scale; kernel_scale.assign(batch_n_y, 1.0f); diff --git a/include/superkmeans/superkmeans.h b/include/superkmeans/superkmeans.h index 9e9d3a5..1eea344 100644 --- a/include/superkmeans/superkmeans.h +++ b/include/superkmeans/superkmeans.h @@ -699,12 +699,12 @@ class SuperKMeans { GetQuantizedL2NormsRowMajor(quantized_centroids_buf.data(), n_clusters, quantized_centroid_norms.get()); } - const size_t i8_k = 2; + const size_t i8_k = 1; std::vector i8_knn(n_samples * i8_k); std::vector i8_knn_distances(n_samples * i8_k); std::vector i8_tmp_distances_buf(X_BATCH_SIZE * Y_BATCH_SIZE); { - SKM_PROFILE_SCOPE("assign_i8/knn"); + SKM_PROFILE_SCOPE("assign_i8/i8_knn"); FindKNearestNeighborsI8( quantized_data.data(), quantized_centroids_buf.data(), From 50865fcc4c002d208321c666f9bb0d636ba4b53b Mon Sep 17 00:00:00 2001 From: Joseph Hwang Date: Sun, 22 Mar 2026 23:50:07 -0700 Subject: [PATCH 5/5] Ok --- f32_benchmark_output.txt | 117 ++++++++++++++++ f32_xnnpack_benchmark_output.txt | 120 ++++++++++++++++ ...nnpack_10_knn_rescore_benchmark_output.txt | 132 ++++++++++++++++++ i8_xnnpack_benchmark_output.txt | 131 +++++++++++++++++ include/superkmeans/superkmeans.h | 5 +- 5 files changed, 502 insertions(+), 3 deletions(-) create mode 100644 f32_benchmark_output.txt create mode 100644 f32_xnnpack_benchmark_output.txt create mode 100644 i8_xnnpack_10_knn_rescore_benchmark_output.txt create mode 100644 i8_xnnpack_benchmark_output.txt diff --git a/f32_benchmark_output.txt b/f32_benchmark_output.txt new file mode 100644 index 0000000..ba38eab --- /dev/null +++ b/f32_benchmark_output.txt @@ -0,0 +1,117 @@ +########################################## +# DATASET: cohere2m +########################################## + +========================================== +Running benchmarks for cohere2m... +========================================== + +---------------------------------------- +1/3: SuperKMeans +---------------------------------------- +=== Running algorithm: superkmeans === +Dataset: cohere2m (n=2000000, d=1024) +n_clusters=5656 n_iters=5 sampling_fraction=1 +Eigen # threads: 1 (note: it will always be 1 if BLAS is enabled) +Front dimensions (d') = 128 +Trailing dimensions (d'') = 768 +Sampling data... +Using 2000000 vectors +Iteration 1/5 | Objective: 1.90344e+06 | Objective improvement: 0 | Shift: 2135.92 | Split: 0 | Recall: 0 [BLAS-only] + +Iteration 2/5 | Objective: 1.13074e+06 | Objective improvement: 0.405949 | Shift: 80.9742 | Split: 0 | Recall: 0 [BLAS-only] + +Iteration 3/5 | Objective: 1.09574e+06 | Objective improvement: 0.0309505 | Shift: 30.4903 | Split: 0 | Recall: 0 [BLAS-only] + +Iteration 4/5 | Objective: 1.08237e+06 | Objective improvement: 0.012203 | Shift: 16.1337 | Split: 0 | Recall: 0 [BLAS-only] + +Iteration 5/5 | Objective: 1.07518e+06 | Objective improvement: 0.00664073 | Shift: 10.0611 | Split: 0 | Recall: 0 [BLAS-only] + + +========== PROFILER RESULTS ========== +assign_f32 111.204s (90.648%) [5 calls] +rotator 5.507s (4.489%) [3 calls] +update_centroids 3.041s (2.479%) [5 calls] +norms_calc 2.724s (2.220%) [7 calls] +fill 0.078s (0.063%) [5 calls] +generating_centroids 0.077s (0.062%) +consolidate 0.029s (0.024%) [5 calls] + - pdxify 0.034s (0.028%) [6 calls] + - splitting 0.002s (0.001%) [5 calls] + - normalize 0.000s (0.000%) [5 calls] +unrotator 0.011s (0.009%) +compute_cost 0.003s (0.002%) [5 calls] +shift 0.003s (0.002%) [5 calls] +allocator 0.000s (0.000%) +------------------------------------------- +TOTAL 122.677s +=========================================== + +Training completed in 122746.855 ms +Actual iterations: 5 (requested: 5) +Final objective: 1075182.750 + +--- Computing Recall --- +Ground truth file: /Users/jzh/research/SuperKMeans/benchmarks/ground_truth/cohere2m.json +Queries file: /Users/jzh/research/SuperKMeans/benchmarks/data/data_cohere2m_test.bin +Using 1000 queries (loaded 1000 from ground truth) +Cluster size stats: mean=353.607, gmean=308.812, std=173.238, CV=0.490, min=15, max=1517 + +--- Recall@10 --- +Recall@ 5 ( 0.10% centroids, 1969 avg vectors): 0.6299 ± 0.3537 +Recall@ 11 ( 0.20% centroids, 4202 avg vectors): 0.7148 ± 0.3268 +Recall@ 16 ( 0.30% centroids, 6050 avg vectors): 0.7578 ± 0.3046 +Recall@ 22 ( 0.40% centroids, 8230 avg vectors): 0.7848 ± 0.2895 +Recall@ 28 ( 0.50% centroids, 10414 avg vectors): 0.8060 ± 0.2763 +Recall@ 33 ( 0.60% centroids, 12214 avg vectors): 0.8204 ± 0.2652 +Recall@ 39 ( 0.70% centroids, 14371 avg vectors): 0.8350 ± 0.2550 +Recall@ 45 ( 0.80% centroids, 16532 avg vectors): 0.8434 ± 0.2492 +Recall@ 50 ( 0.90% centroids, 18323 avg vectors): 0.8515 ± 0.2423 +Recall@ 56 ( 1.00% centroids, 20488 avg vectors): 0.8587 ± 0.2370 +Recall@ 70 ( 1.25% centroids, 25527 avg vectors): 0.8703 ± 0.2267 +Recall@ 84 ( 1.50% centroids, 30547 avg vectors): 0.8799 ± 0.2161 +Recall@ 98 ( 1.75% centroids, 35559 avg vectors): 0.8890 ± 0.2079 +Recall@ 113 ( 2.00% centroids, 40918 avg vectors): 0.8961 ± 0.1987 +Recall@ 127 ( 2.25% centroids, 45937 avg vectors): 0.9023 ± 0.1914 +Recall@ 141 ( 2.50% centroids, 50928 avg vectors): 0.9065 ± 0.1872 +Recall@ 155 ( 2.75% centroids, 55956 avg vectors): 0.9114 ± 0.1808 +Recall@ 169 ( 3.00% centroids, 60934 avg vectors): 0.9149 ± 0.1764 +Recall@ 183 ( 3.25% centroids, 65893 avg vectors): 0.9177 ± 0.1731 +Recall@ 197 ( 3.50% centroids, 70869 avg vectors): 0.9203 ± 0.1698 +Recall@ 212 ( 3.75% centroids, 76187 avg vectors): 0.9243 ± 0.1649 +Recall@ 226 ( 4.00% centroids, 81156 avg vectors): 0.9282 ± 0.1576 +Recall@ 240 ( 4.25% centroids, 86139 avg vectors): 0.9312 ± 0.1535 +Recall@ 254 ( 4.50% centroids, 91087 avg vectors): 0.9349 ± 0.1497 +Recall@ 268 ( 4.75% centroids, 96056 avg vectors): 0.9378 ± 0.1470 +Recall@ 282 ( 5.00% centroids, 101024 avg vectors): 0.9389 ± 0.1459 +Recall@ 565 (10.00% centroids, 201269 avg vectors): 0.9642 ± 0.1076 + +--- Recall@100 --- +Recall@ 5 ( 0.10% centroids, 1969 avg vectors): 0.5628 ± 0.2819 +Recall@ 11 ( 0.20% centroids, 4202 avg vectors): 0.6625 ± 0.2638 +Recall@ 16 ( 0.30% centroids, 6050 avg vectors): 0.7089 ± 0.2510 +Recall@ 22 ( 0.40% centroids, 8230 avg vectors): 0.7443 ± 0.2367 +Recall@ 28 ( 0.50% centroids, 10414 avg vectors): 0.7661 ± 0.2284 +Recall@ 33 ( 0.60% centroids, 12214 avg vectors): 0.7812 ± 0.2212 +Recall@ 39 ( 0.70% centroids, 14371 avg vectors): 0.7967 ± 0.2135 +Recall@ 45 ( 0.80% centroids, 16532 avg vectors): 0.8084 ± 0.2068 +Recall@ 50 ( 0.90% centroids, 18323 avg vectors): 0.8173 ± 0.2009 +Recall@ 56 ( 1.00% centroids, 20488 avg vectors): 0.8270 ± 0.1961 +Recall@ 70 ( 1.25% centroids, 25527 avg vectors): 0.8421 ± 0.1874 +Recall@ 84 ( 1.50% centroids, 30547 avg vectors): 0.8553 ± 0.1782 +Recall@ 98 ( 1.75% centroids, 35559 avg vectors): 0.8660 ± 0.1708 +Recall@ 113 ( 2.00% centroids, 40918 avg vectors): 0.8753 ± 0.1637 +Recall@ 127 ( 2.25% centroids, 45937 avg vectors): 0.8830 ± 0.1561 +Recall@ 141 ( 2.50% centroids, 50928 avg vectors): 0.8893 ± 0.1514 +Recall@ 155 ( 2.75% centroids, 55956 avg vectors): 0.8955 ± 0.1450 +Recall@ 169 ( 3.00% centroids, 60934 avg vectors): 0.9002 ± 0.1402 +Recall@ 183 ( 3.25% centroids, 65893 avg vectors): 0.9044 ± 0.1370 +Recall@ 197 ( 3.50% centroids, 70869 avg vectors): 0.9082 ± 0.1335 +Recall@ 212 ( 3.75% centroids, 76187 avg vectors): 0.9127 ± 0.1288 +Recall@ 226 ( 4.00% centroids, 81156 avg vectors): 0.9167 ± 0.1250 +Recall@ 240 ( 4.25% centroids, 86139 avg vectors): 0.9194 ± 0.1227 +Recall@ 254 ( 4.50% centroids, 91087 avg vectors): 0.9225 ± 0.1198 +Recall@ 268 ( 4.75% centroids, 96056 avg vectors): 0.9259 ± 0.1161 +Recall@ 282 ( 5.00% centroids, 101024 avg vectors): 0.9280 ± 0.1141 +Recall@ 565 (10.00% centroids, 201269 avg vectors): 0.9572 ± 0.0804 +Results written to: /Users/jzh/research/SuperKMeans/benchmarks/results/default/end_to_end.csv diff --git a/f32_xnnpack_benchmark_output.txt b/f32_xnnpack_benchmark_output.txt new file mode 100644 index 0000000..1e774f9 --- /dev/null +++ b/f32_xnnpack_benchmark_output.txt @@ -0,0 +1,120 @@ +########################################## +# DATASET: cohere2m +########################################## + +========================================== +Running benchmarks for cohere2m... +========================================== + +---------------------------------------- +1/3: SuperKMeans +---------------------------------------- +=== Running algorithm: superkmeans === +Dataset: cohere2m (n=2000000, d=1024) +n_clusters=5656 n_iters=5 sampling_fraction=1 +Eigen # threads: 1 (note: it will always be 1 if BLAS is enabled) +Front dimensions (d') = 128 +Trailing dimensions (d'') = 768 +Sampling data... +Using 2000000 vectors +Iteration 1/5 | Objective: 1.90344e+06 | Objective improvement: 0 | Shift: 2135.92 | Split: 0 | Recall: 0 [BLAS-only] + +Iteration 2/5 | Objective: 1.13074e+06 | Objective improvement: 0.405949 | Shift: 80.9742 | Split: 0 | Recall: 0 [BLAS-only] + +Iteration 3/5 | Objective: 1.09574e+06 | Objective improvement: 0.0309505 | Shift: 30.4903 | Split: 0 | Recall: 0 [BLAS-only] + +Iteration 4/5 | Objective: 1.08237e+06 | Objective improvement: 0.012203 | Shift: 16.1337 | Split: 0 | Recall: 0 [BLAS-only] + +Iteration 5/5 | Objective: 1.07518e+06 | Objective improvement: 0.00664073 | Shift: 10.0611 | Split: 0 | Recall: 0 [BLAS-only] + + +========== PROFILER RESULTS ========== +assign_f32 266.597s (50.005%) [5 calls] +xnnpack_f32 250.760s (47.034%) [110451 calls] + - matmul 246.981s (46.325%) [110268 calls] + - pack 0.050s (0.009%) [116917 calls] +rotator 7.984s (1.498%) [3 calls] +update_centroids 5.001s (0.938%) [5 calls] +norms_calc 2.627s (0.493%) [7 calls] +generating_centroids 0.094s (0.018%) +consolidate 0.042s (0.008%) [5 calls] + - pdxify 0.049s (0.009%) [6 calls] + - splitting 0.002s (0.000%) [5 calls] + - normalize 0.000s (0.000%) [5 calls] +fill 0.021s (0.004%) [5 calls] +unrotator 0.013s (0.002%) +compute_cost 0.004s (0.001%) [5 calls] +shift 0.003s (0.001%) [5 calls] +allocator 0.000s (0.000%) +------------------------------------------- +TOTAL 533.147s +=========================================== + +Training completed in 282555.208 ms +Actual iterations: 5 (requested: 5) +Final objective: 1075182.750 + +--- Computing Recall --- +Ground truth file: /Users/jzh/research/SuperKMeans/benchmarks/ground_truth/cohere2m.json +Queries file: /Users/jzh/research/SuperKMeans/benchmarks/data/data_cohere2m_test.bin +Using 1000 queries (loaded 1000 from ground truth) +Cluster size stats: mean=353.607, gmean=308.812, std=173.238, CV=0.490, min=15, max=1517 + +--- Recall@10 --- +Recall@ 5 ( 0.10% centroids, 1969 avg vectors): 0.6299 ± 0.3537 +Recall@ 11 ( 0.20% centroids, 4202 avg vectors): 0.7148 ± 0.3268 +Recall@ 16 ( 0.30% centroids, 6050 avg vectors): 0.7578 ± 0.3046 +Recall@ 22 ( 0.40% centroids, 8230 avg vectors): 0.7848 ± 0.2895 +Recall@ 28 ( 0.50% centroids, 10414 avg vectors): 0.8060 ± 0.2763 +Recall@ 33 ( 0.60% centroids, 12214 avg vectors): 0.8204 ± 0.2652 +Recall@ 39 ( 0.70% centroids, 14371 avg vectors): 0.8350 ± 0.2550 +Recall@ 45 ( 0.80% centroids, 16532 avg vectors): 0.8434 ± 0.2492 +Recall@ 50 ( 0.90% centroids, 18323 avg vectors): 0.8515 ± 0.2423 +Recall@ 56 ( 1.00% centroids, 20488 avg vectors): 0.8587 ± 0.2370 +Recall@ 70 ( 1.25% centroids, 25527 avg vectors): 0.8703 ± 0.2267 +Recall@ 84 ( 1.50% centroids, 30547 avg vectors): 0.8799 ± 0.2161 +Recall@ 98 ( 1.75% centroids, 35559 avg vectors): 0.8890 ± 0.2079 +Recall@ 113 ( 2.00% centroids, 40918 avg vectors): 0.8961 ± 0.1987 +Recall@ 127 ( 2.25% centroids, 45937 avg vectors): 0.9023 ± 0.1914 +Recall@ 141 ( 2.50% centroids, 50928 avg vectors): 0.9065 ± 0.1872 +Recall@ 155 ( 2.75% centroids, 55956 avg vectors): 0.9114 ± 0.1808 +Recall@ 169 ( 3.00% centroids, 60934 avg vectors): 0.9149 ± 0.1764 +Recall@ 183 ( 3.25% centroids, 65893 avg vectors): 0.9177 ± 0.1731 +Recall@ 197 ( 3.50% centroids, 70869 avg vectors): 0.9203 ± 0.1698 +Recall@ 212 ( 3.75% centroids, 76187 avg vectors): 0.9243 ± 0.1649 +Recall@ 226 ( 4.00% centroids, 81156 avg vectors): 0.9282 ± 0.1576 +Recall@ 240 ( 4.25% centroids, 86139 avg vectors): 0.9312 ± 0.1535 +Recall@ 254 ( 4.50% centroids, 91087 avg vectors): 0.9349 ± 0.1497 +Recall@ 268 ( 4.75% centroids, 96056 avg vectors): 0.9378 ± 0.1470 +Recall@ 282 ( 5.00% centroids, 101024 avg vectors): 0.9389 ± 0.1459 +Recall@ 565 (10.00% centroids, 201269 avg vectors): 0.9642 ± 0.1076 + +--- Recall@100 --- +Recall@ 5 ( 0.10% centroids, 1969 avg vectors): 0.5628 ± 0.2819 +Recall@ 11 ( 0.20% centroids, 4202 avg vectors): 0.6625 ± 0.2638 +Recall@ 16 ( 0.30% centroids, 6050 avg vectors): 0.7089 ± 0.2510 +Recall@ 22 ( 0.40% centroids, 8230 avg vectors): 0.7443 ± 0.2367 +Recall@ 28 ( 0.50% centroids, 10414 avg vectors): 0.7661 ± 0.2284 +Recall@ 33 ( 0.60% centroids, 12214 avg vectors): 0.7812 ± 0.2212 +Recall@ 39 ( 0.70% centroids, 14371 avg vectors): 0.7967 ± 0.2135 +Recall@ 45 ( 0.80% centroids, 16532 avg vectors): 0.8084 ± 0.2068 +Recall@ 50 ( 0.90% centroids, 18323 avg vectors): 0.8173 ± 0.2009 +Recall@ 56 ( 1.00% centroids, 20488 avg vectors): 0.8270 ± 0.1961 +Recall@ 70 ( 1.25% centroids, 25527 avg vectors): 0.8421 ± 0.1874 +Recall@ 84 ( 1.50% centroids, 30547 avg vectors): 0.8553 ± 0.1782 +Recall@ 98 ( 1.75% centroids, 35559 avg vectors): 0.8660 ± 0.1708 +Recall@ 113 ( 2.00% centroids, 40918 avg vectors): 0.8753 ± 0.1637 +Recall@ 127 ( 2.25% centroids, 45937 avg vectors): 0.8830 ± 0.1561 +Recall@ 141 ( 2.50% centroids, 50928 avg vectors): 0.8893 ± 0.1514 +Recall@ 155 ( 2.75% centroids, 55956 avg vectors): 0.8955 ± 0.1450 +Recall@ 169 ( 3.00% centroids, 60934 avg vectors): 0.9002 ± 0.1402 +Recall@ 183 ( 3.25% centroids, 65893 avg vectors): 0.9044 ± 0.1370 +Recall@ 197 ( 3.50% centroids, 70869 avg vectors): 0.9082 ± 0.1335 +Recall@ 212 ( 3.75% centroids, 76187 avg vectors): 0.9127 ± 0.1288 +Recall@ 226 ( 4.00% centroids, 81156 avg vectors): 0.9167 ± 0.1250 +Recall@ 240 ( 4.25% centroids, 86139 avg vectors): 0.9194 ± 0.1227 +Recall@ 254 ( 4.50% centroids, 91087 avg vectors): 0.9225 ± 0.1198 +Recall@ 268 ( 4.75% centroids, 96056 avg vectors): 0.9259 ± 0.1161 +Recall@ 282 ( 5.00% centroids, 101024 avg vectors): 0.9280 ± 0.1141 +Recall@ 565 (10.00% centroids, 201269 avg vectors): 0.9572 ± 0.0804 +Results written to: /Users/jzh/research/SuperKMeans/benchmarks/results/default/end_to_end.csv \ No newline at end of file diff --git a/i8_xnnpack_10_knn_rescore_benchmark_output.txt b/i8_xnnpack_10_knn_rescore_benchmark_output.txt new file mode 100644 index 0000000..d9fbe6d --- /dev/null +++ b/i8_xnnpack_10_knn_rescore_benchmark_output.txt @@ -0,0 +1,132 @@ +########################################## +# DATASET: cohere2m +########################################## + +========================================== +Running benchmarks for cohere2m... +========================================== + +---------------------------------------- +1/3: SuperKMeans +---------------------------------------- +=== Running algorithm: superkmeans === +Dataset: cohere2m (n=2000000, d=1024) +n_clusters=5656 n_iters=5 sampling_fraction=1 +Eigen # threads: 1 (note: it will always be 1 if BLAS is enabled) +Front dimensions (d') = 128 +Trailing dimensions (d'') = 768 +Sampling data... +Using 2000000 vectors +i8 vs f32 assignment agreement: 1999953 / 2000000 (99.9976%) +Iteration 1/5 | Objective: 1.90344e+06 | Objective improvement: 0 | Shift: 2135.94 | Split: 0 | Recall: 0 [BLAS-only] + +i8 vs f32 assignment agreement: 1991285 / 2000000 (99.5643%) +Iteration 2/5 | Objective: 1.13085e+06 | Objective improvement: 0.405892 | Shift: 81.4288 | Split: 0 | Recall: 0 [BLAS-only] + +i8 vs f32 assignment agreement: 1997058 / 2000000 (99.8529%) +Iteration 3/5 | Objective: 1.09575e+06 | Objective improvement: 0.0310347 | Shift: 30.5562 | Split: 0 | Recall: 0 [BLAS-only] + +i8 vs f32 assignment agreement: 1997574 / 2000000 (99.8787%) +Iteration 4/5 | Objective: 1.08238e+06 | Objective improvement: 0.0122005 | Shift: 16.2518 | Split: 0 | Recall: 0 [BLAS-only] + +i8 vs f32 assignment agreement: 1997675 / 2000000 (99.8838%) +Iteration 5/5 | Objective: 1.0752e+06 | Objective improvement: 0.00663465 | Shift: 9.95498 | Split: 0 | Recall: 0 [BLAS-only] + + +========== PROFILER RESULTS ========== +assign_f32 111.674s (45.204%) [5 calls] +assign_i8 65.600s (26.554%) [5 calls] + - i8_knn 60.490s (24.485%) [5 calls] + - rerank 3.569s (1.445%) [5 calls] +i8_knn 60.490s (24.485%) [5 calls] + - matmul 52.337s (21.185%) [735 calls] + - top_k 8.099s (3.278%) [735 calls] + - pack 0.000s (0.000%) [735 calls] +rotator 5.324s (2.155%) [3 calls] +norms_calc 1.756s (0.711%) [13 calls] +update_centroids 1.292s (0.523%) [5 calls] +quantize_data 0.717s (0.290%) +generating_centroids 0.064s (0.026%) +consolidate 0.048s (0.019%) [5 calls] + - pdxify 0.051s (0.021%) [6 calls] + - splitting 0.002s (0.001%) [5 calls] + - normalize 0.000s (0.000%) [5 calls] +fill 0.038s (0.015%) [10 calls] +quantize_centroids 0.018s (0.007%) [5 calls] +unrotator 0.016s (0.006%) +compare_assignments 0.007s (0.003%) [5 calls] +shift 0.003s (0.001%) [5 calls] +compute_cost 0.001s (0.000%) [5 calls] +allocator 0.000s (0.000%) +------------------------------------------- +TOTAL 247.048s +=========================================== + +Training completed in 185900.305 ms +Actual iterations: 5 (requested: 5) +Final objective: 1075201.500 + +--- Computing Recall --- +Ground truth file: /Users/jzh/research/SuperKMeans/benchmarks/ground_truth/cohere2m.json +Queries file: /Users/jzh/research/SuperKMeans/benchmarks/data/data_cohere2m_test.bin +Using 1000 queries (loaded 1000 from ground truth) +Cluster size stats: mean=353.607, gmean=308.569, std=173.012, CV=0.489, min=13, max=1502 + +--- Recall@10 --- +Recall@ 5 ( 0.10% centroids, 1974 avg vectors): 0.6276 ± 0.3551 +Recall@ 11 ( 0.20% centroids, 4209 avg vectors): 0.7162 ± 0.3258 +Recall@ 16 ( 0.30% centroids, 6058 avg vectors): 0.7562 ± 0.3052 +Recall@ 22 ( 0.40% centroids, 8247 avg vectors): 0.7840 ± 0.2895 +Recall@ 28 ( 0.50% centroids, 10439 avg vectors): 0.8049 ± 0.2756 +Recall@ 33 ( 0.60% centroids, 12238 avg vectors): 0.8193 ± 0.2658 +Recall@ 39 ( 0.70% centroids, 14402 avg vectors): 0.8344 ± 0.2547 +Recall@ 45 ( 0.80% centroids, 16559 avg vectors): 0.8425 ± 0.2500 +Recall@ 50 ( 0.90% centroids, 18378 avg vectors): 0.8490 ± 0.2445 +Recall@ 56 ( 1.00% centroids, 20543 avg vectors): 0.8580 ± 0.2370 +Recall@ 70 ( 1.25% centroids, 25592 avg vectors): 0.8697 ± 0.2261 +Recall@ 84 ( 1.50% centroids, 30603 avg vectors): 0.8796 ± 0.2163 +Recall@ 98 ( 1.75% centroids, 35642 avg vectors): 0.8887 ± 0.2084 +Recall@ 113 ( 2.00% centroids, 40999 avg vectors): 0.8961 ± 0.1986 +Recall@ 127 ( 2.25% centroids, 46031 avg vectors): 0.9024 ± 0.1915 +Recall@ 141 ( 2.50% centroids, 51038 avg vectors): 0.9067 ± 0.1872 +Recall@ 155 ( 2.75% centroids, 56052 avg vectors): 0.9116 ± 0.1809 +Recall@ 169 ( 3.00% centroids, 61050 avg vectors): 0.9141 ± 0.1775 +Recall@ 183 ( 3.25% centroids, 66004 avg vectors): 0.9173 ± 0.1740 +Recall@ 197 ( 3.50% centroids, 70997 avg vectors): 0.9200 ± 0.1707 +Recall@ 212 ( 3.75% centroids, 76339 avg vectors): 0.9238 ± 0.1664 +Recall@ 226 ( 4.00% centroids, 81305 avg vectors): 0.9276 ± 0.1597 +Recall@ 240 ( 4.25% centroids, 86285 avg vectors): 0.9315 ± 0.1533 +Recall@ 254 ( 4.50% centroids, 91273 avg vectors): 0.9343 ± 0.1509 +Recall@ 268 ( 4.75% centroids, 96262 avg vectors): 0.9365 ± 0.1488 +Recall@ 282 ( 5.00% centroids, 101264 avg vectors): 0.9384 ± 0.1472 +Recall@ 565 (10.00% centroids, 201750 avg vectors): 0.9637 ± 0.1086 + +--- Recall@100 --- +Recall@ 5 ( 0.10% centroids, 1974 avg vectors): 0.5620 ± 0.2818 +Recall@ 11 ( 0.20% centroids, 4209 avg vectors): 0.6628 ± 0.2634 +Recall@ 16 ( 0.30% centroids, 6058 avg vectors): 0.7087 ± 0.2514 +Recall@ 22 ( 0.40% centroids, 8247 avg vectors): 0.7430 ± 0.2382 +Recall@ 28 ( 0.50% centroids, 10439 avg vectors): 0.7655 ± 0.2291 +Recall@ 33 ( 0.60% centroids, 12238 avg vectors): 0.7812 ± 0.2216 +Recall@ 39 ( 0.70% centroids, 14402 avg vectors): 0.7963 ± 0.2143 +Recall@ 45 ( 0.80% centroids, 16559 avg vectors): 0.8082 ± 0.2070 +Recall@ 50 ( 0.90% centroids, 18378 avg vectors): 0.8165 ± 0.2016 +Recall@ 56 ( 1.00% centroids, 20543 avg vectors): 0.8267 ± 0.1964 +Recall@ 70 ( 1.25% centroids, 25592 avg vectors): 0.8418 ± 0.1877 +Recall@ 84 ( 1.50% centroids, 30603 avg vectors): 0.8547 ± 0.1789 +Recall@ 98 ( 1.75% centroids, 35642 avg vectors): 0.8656 ± 0.1709 +Recall@ 113 ( 2.00% centroids, 40999 avg vectors): 0.8747 ± 0.1639 +Recall@ 127 ( 2.25% centroids, 46031 avg vectors): 0.8820 ± 0.1576 +Recall@ 141 ( 2.50% centroids, 51038 avg vectors): 0.8892 ± 0.1515 +Recall@ 155 ( 2.75% centroids, 56052 avg vectors): 0.8954 ± 0.1447 +Recall@ 169 ( 3.00% centroids, 61050 avg vectors): 0.8996 ± 0.1410 +Recall@ 183 ( 3.25% centroids, 66004 avg vectors): 0.9038 ± 0.1370 +Recall@ 197 ( 3.50% centroids, 70997 avg vectors): 0.9077 ± 0.1338 +Recall@ 212 ( 3.75% centroids, 76339 avg vectors): 0.9121 ± 0.1291 +Recall@ 226 ( 4.00% centroids, 81305 avg vectors): 0.9160 ± 0.1257 +Recall@ 240 ( 4.25% centroids, 86285 avg vectors): 0.9193 ± 0.1227 +Recall@ 254 ( 4.50% centroids, 91273 avg vectors): 0.9221 ± 0.1201 +Recall@ 268 ( 4.75% centroids, 96262 avg vectors): 0.9250 ± 0.1177 +Recall@ 282 ( 5.00% centroids, 101264 avg vectors): 0.9279 ± 0.1140 +Recall@ 565 (10.00% centroids, 201750 avg vectors): 0.9570 ± 0.0807 +Results written to: /Users/jzh/research/SuperKMeans/benchmarks/results/default/end_to_end.csv diff --git a/i8_xnnpack_benchmark_output.txt b/i8_xnnpack_benchmark_output.txt new file mode 100644 index 0000000..edeaed9 --- /dev/null +++ b/i8_xnnpack_benchmark_output.txt @@ -0,0 +1,131 @@ +########################################## +# DATASET: cohere2m +########################################## + +========================================== +Running benchmarks for cohere2m... +========================================== + +---------------------------------------- +1/3: SuperKMeans +---------------------------------------- +=== Running algorithm: superkmeans === +Dataset: cohere2m (n=2000000, d=1024) +n_clusters=5656 n_iters=5 sampling_fraction=1 +Eigen # threads: 1 (note: it will always be 1 if BLAS is enabled) +Front dimensions (d') = 128 +Trailing dimensions (d'') = 768 +Sampling data... +Using 2000000 vectors +i8 vs f32 assignment agreement: 1982225 / 2000000 (99.1112%) +Iteration 1/5 | Objective: 1.03122e+12 | Objective improvement: 0 | Shift: 2135.99 | Split: 0 | Recall: 0 [BLAS-only] + +i8 vs f32 assignment agreement: 1521795 / 2000000 (76.0897%) +Iteration 2/5 | Objective: 7.45367e+11 | Objective improvement: 0.277202 | Shift: 104.263 | Split: 0 | Recall: 0 [BLAS-only] + +i8 vs f32 assignment agreement: 1546745 / 2000000 (77.3372%) +Iteration 3/5 | Objective: 6.46113e+11 | Objective improvement: 0.133162 | Shift: 44.8106 | Split: 2 | Recall: 0 [BLAS-only] + +i8 vs f32 assignment agreement: 1544260 / 2000000 (77.213%) +Iteration 4/5 | Objective: 6.25703e+11 | Objective improvement: 0.0315884 | Shift: 26.0822 | Split: 0 | Recall: 0 [BLAS-only] + +i8 vs f32 assignment agreement: 1566088 / 2000000 (78.3044%) +Iteration 5/5 | Objective: 6.08461e+11 | Objective improvement: 0.0275563 | Shift: 17.442 | Split: 0 | Recall: 0 [BLAS-only] + + +========== PROFILER RESULTS ========== +assign_f32 112.479s (44.681%) [5 calls] +assign_i8 63.401s (25.185%) [5 calls] + - i8_knn 61.786s (24.544%) [5 calls] +i8_knn 61.785s (24.543%) [5 calls] + - matmul 57.860s (22.984%) [735 calls] + - top_k 3.912s (1.554%) [735 calls] + - pack 0.001s (0.000%) [735 calls] +update_centroids 7.229s (2.871%) [5 calls] +rotator 4.141s (1.645%) [3 calls] +norms_calc 1.813s (0.720%) [13 calls] +quantize_data 0.710s (0.282%) +consolidate 0.049s (0.019%) [5 calls] + - pdxify 0.052s (0.021%) [6 calls] + - splitting 0.002s (0.001%) [5 calls] + - normalize 0.000s (0.000%) [5 calls] +generating_centroids 0.047s (0.019%) +fill 0.027s (0.011%) [10 calls] +quantize_centroids 0.019s (0.007%) [5 calls] +shift 0.016s (0.006%) [5 calls] +unrotator 0.014s (0.005%) +compare_assignments 0.007s (0.003%) [5 calls] +compute_cost 0.002s (0.001%) [5 calls] +allocator 0.000s (0.000%) +------------------------------------------- +TOTAL 251.737s +=========================================== + +Training completed in 189272.002 ms +Actual iterations: 5 (requested: 5) +Final objective: 608460800000.000 + +--- Computing Recall --- +Ground truth file: /Users/jzh/research/SuperKMeans/benchmarks/ground_truth/cohere2m.json +Queries file: /Users/jzh/research/SuperKMeans/benchmarks/data/data_cohere2m_test.bin +Using 1000 queries (loaded 1000 from ground truth) +Cluster size stats: mean=353.607, gmean=281.562, std=227.723, CV=0.644, min=8, max=2740 + +--- Recall@10 --- +Recall@ 5 ( 0.10% centroids, 2894 avg vectors): 0.6197 ± 0.3485 +Recall@ 11 ( 0.20% centroids, 6342 avg vectors): 0.7118 ± 0.3183 +Recall@ 16 ( 0.30% centroids, 9231 avg vectors): 0.7537 ± 0.3011 +Recall@ 22 ( 0.40% centroids, 12667 avg vectors): 0.7821 ± 0.2865 +Recall@ 28 ( 0.50% centroids, 16088 avg vectors): 0.8023 ± 0.2722 +Recall@ 33 ( 0.60% centroids, 18964 avg vectors): 0.8151 ± 0.2631 +Recall@ 39 ( 0.70% centroids, 22388 avg vectors): 0.8269 ± 0.2567 +Recall@ 45 ( 0.80% centroids, 25782 avg vectors): 0.8379 ± 0.2484 +Recall@ 50 ( 0.90% centroids, 28614 avg vectors): 0.8484 ± 0.2414 +Recall@ 56 ( 1.00% centroids, 32041 avg vectors): 0.8574 ± 0.2324 +Recall@ 70 ( 1.25% centroids, 39931 avg vectors): 0.8728 ± 0.2175 +Recall@ 84 ( 1.50% centroids, 47762 avg vectors): 0.8832 ± 0.2098 +Recall@ 98 ( 1.75% centroids, 55554 avg vectors): 0.8925 ± 0.2012 +Recall@ 113 ( 2.00% centroids, 63861 avg vectors): 0.8989 ± 0.1947 +Recall@ 127 ( 2.25% centroids, 71554 avg vectors): 0.9048 ± 0.1884 +Recall@ 141 ( 2.50% centroids, 79228 avg vectors): 0.9094 ± 0.1818 +Recall@ 155 ( 2.75% centroids, 86888 avg vectors): 0.9131 ± 0.1785 +Recall@ 169 ( 3.00% centroids, 94466 avg vectors): 0.9179 ± 0.1730 +Recall@ 183 ( 3.25% centroids, 102100 avg vectors): 0.9239 ± 0.1651 +Recall@ 197 ( 3.50% centroids, 109666 avg vectors): 0.9272 ± 0.1603 +Recall@ 212 ( 3.75% centroids, 117777 avg vectors): 0.9309 ± 0.1543 +Recall@ 226 ( 4.00% centroids, 125213 avg vectors): 0.9343 ± 0.1498 +Recall@ 240 ( 4.25% centroids, 132720 avg vectors): 0.9369 ± 0.1452 +Recall@ 254 ( 4.50% centroids, 140198 avg vectors): 0.9386 ± 0.1429 +Recall@ 268 ( 4.75% centroids, 147647 avg vectors): 0.9406 ± 0.1410 +Recall@ 282 ( 5.00% centroids, 155039 avg vectors): 0.9429 ± 0.1370 +Recall@ 565 (10.00% centroids, 300209 avg vectors): 0.9694 ± 0.0981 + +--- Recall@100 --- +Recall@ 5 ( 0.10% centroids, 2894 avg vectors): 0.5502 ± 0.2816 +Recall@ 11 ( 0.20% centroids, 6342 avg vectors): 0.6546 ± 0.2636 +Recall@ 16 ( 0.30% centroids, 9231 avg vectors): 0.7005 ± 0.2513 +Recall@ 22 ( 0.40% centroids, 12667 avg vectors): 0.7330 ± 0.2409 +Recall@ 28 ( 0.50% centroids, 16088 avg vectors): 0.7587 ± 0.2305 +Recall@ 33 ( 0.60% centroids, 18964 avg vectors): 0.7750 ± 0.2227 +Recall@ 39 ( 0.70% centroids, 22388 avg vectors): 0.7902 ± 0.2156 +Recall@ 45 ( 0.80% centroids, 25782 avg vectors): 0.8039 ± 0.2077 +Recall@ 50 ( 0.90% centroids, 28614 avg vectors): 0.8140 ± 0.2014 +Recall@ 56 ( 1.00% centroids, 32041 avg vectors): 0.8229 ± 0.1958 +Recall@ 70 ( 1.25% centroids, 39931 avg vectors): 0.8418 ± 0.1841 +Recall@ 84 ( 1.50% centroids, 47762 avg vectors): 0.8544 ± 0.1756 +Recall@ 98 ( 1.75% centroids, 55554 avg vectors): 0.8658 ± 0.1680 +Recall@ 113 ( 2.00% centroids, 63861 avg vectors): 0.8751 ± 0.1610 +Recall@ 127 ( 2.25% centroids, 71554 avg vectors): 0.8830 ± 0.1555 +Recall@ 141 ( 2.50% centroids, 79228 avg vectors): 0.8907 ± 0.1470 +Recall@ 155 ( 2.75% centroids, 86888 avg vectors): 0.8964 ± 0.1424 +Recall@ 169 ( 3.00% centroids, 94466 avg vectors): 0.9020 ± 0.1369 +Recall@ 183 ( 3.25% centroids, 102100 avg vectors): 0.9072 ± 0.1326 +Recall@ 197 ( 3.50% centroids, 109666 avg vectors): 0.9114 ± 0.1290 +Recall@ 212 ( 3.75% centroids, 117777 avg vectors): 0.9156 ± 0.1245 +Recall@ 226 ( 4.00% centroids, 125213 avg vectors): 0.9197 ± 0.1207 +Recall@ 240 ( 4.25% centroids, 132720 avg vectors): 0.9232 ± 0.1165 +Recall@ 254 ( 4.50% centroids, 140198 avg vectors): 0.9263 ± 0.1127 +Recall@ 268 ( 4.75% centroids, 147647 avg vectors): 0.9291 ± 0.1097 +Recall@ 282 ( 5.00% centroids, 155039 avg vectors): 0.9324 ± 0.1060 +Recall@ 565 (10.00% centroids, 300209 avg vectors): 0.9623 ± 0.0715 +Results written to: /Users/jzh/research/SuperKMeans/benchmarks/results/default/end_to_end.csv \ No newline at end of file diff --git a/include/superkmeans/superkmeans.h b/include/superkmeans/superkmeans.h index 1eea344..a39bd45 100644 --- a/include/superkmeans/superkmeans.h +++ b/include/superkmeans/superkmeans.h @@ -948,9 +948,9 @@ class SuperKMeans { size_t& iter_idx, const bool is_first_iter, std::vector& target_stats, - const bool run_quantized_assignment = true, + const bool run_quantized_assignment = false, const bool run_f32_assignment = true, - const bool use_quantized_assignment_for_update = true + const bool use_quantized_assignment_for_update = false ) { if (!is_first_iter) { std::swap(horizontal_centroids, prev_centroids); @@ -983,7 +983,6 @@ class SuperKMeans { if (run_quantized_assignment && run_f32_assignment) { SKM_PROFILE_SCOPE("compare_assignments"); size_t match = 0; - size_t mismatches_printed = 0; for (size_t s = 0; s < n_samples; ++s) { if (i8_assignments[s] == assignments[s]) { ++match;