From 09c8b25dd20c7c42cb7c354f7c2b0b75ab0f29ef Mon Sep 17 00:00:00 2001 From: Pedro Gomes Date: Sat, 27 Jun 2026 11:25:23 -0700 Subject: [PATCH 1/6] int8 quantization --- Common/include/linear_algebra/CSysMatrix.hpp | 25 +++++++++ Common/include/linear_algebra/CSysMatrix.inl | 56 ++++++++++++++++--- Common/src/linear_algebra/CSysMatrix.cpp | 41 +++++++++++++- .../include/solvers/CFVMFlowSolverBase.inl | 1 + 4 files changed, 115 insertions(+), 8 deletions(-) diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index ecb26959a06..2104f6b8908 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -33,6 +33,7 @@ #include "CPastixWrapper.hpp" #include "../toolboxes/graph_toolbox.hpp" +#include #include #include #include @@ -125,6 +126,16 @@ class CSysMatrix { * size is needed for fast, per-thread, static memory allocation. */ enum : size_t { MAXNVAR = 20 }; + /*--- Quantized off-diagonal storage. ---*/ + using QuantType = int8_t; + /*! \brief When true, off-diagonal blocks are read from the quantized representation + * during matrix-vector products and LU-SGS; the diagonal block always uses + * the original full-precision storage. Call QuantizeOffDiagonalBlocks() first. */ + static constexpr bool USE_QUANTIZATION = true; + + ScalarType* q_scale; /*!< \brief Scale of each block row (full precision), [nnz * nVar]. */ + QuantType* q_offdiag; /*!< \brief Quantized off-diagonal entries per block, [nnz * nVar * nVar]. */ + enum { OMP_MAX_SIZE_L = 8192 }; /*!< \brief Max. chunk size used in light parallel for loops. */ enum { OMP_MAX_SIZE_H = 512 }; /*!< \brief Max. chunk size used in heavy parallel for loops. */ enum { OMP_MIN_SIZE = 32 }; /*!< \brief Chunk size for finer grain operations. */ @@ -372,6 +383,15 @@ class CSysMatrix { */ void RowProduct(const CSysVector& vec, unsigned long row_i, ScalarType* prod) const; + /*! + * \brief Computes product += A_k * vec using the quantized representation of block k. + * \note Only valid after QuantizeOffDiagonalBlocks() has been called. + * \param[in] k - Block index in the CSR flat storage. + * \param[in] vec - Input vector (nEqn entries). + * \param[in,out] prod - Accumulation output (nVar entries). + */ + inline void QuantizedMatVecAdd(unsigned long k, const ScalarType* vec, ScalarType* prod) const; + public: /*! * \brief Constructor of the class. @@ -398,6 +418,11 @@ class CSysMatrix { bool EdgeConnect, CGeometry* geometry, const CConfig* config, bool needTranspPtr = false, bool grad_mode = false); + /*! + * \brief Compresses off-diagonal blocks into quantized form for use with USE_QUANTIZATION. + */ + void QuantizeOffDiagonalBlocks(); + /*! * \brief Sets to zero all the entries of the sparse matrix. */ diff --git a/Common/include/linear_algebra/CSysMatrix.inl b/Common/include/linear_algebra/CSysMatrix.inl index 163e6fb08da..56de5ae428b 100644 --- a/Common/include/linear_algebra/CSysMatrix.inl +++ b/Common/include/linear_algebra/CSysMatrix.inl @@ -145,8 +145,18 @@ template FORCEINLINE void CSysMatrix::Gauss_Elimination(unsigned long block_i, ScalarType* rhs) const { /*--- Copy block, as the algorithm modifies the matrix ---*/ ScalarType block[MAXNVAR * MAXNVAR]; - MatrixCopy(&matrix[dia_ptr[block_i] * nVar * nVar], block); - + if (USE_QUANTIZATION && nVar > 1) { + const auto k = dia_ptr[block_i]; + const QuantType* q = &q_offdiag[k * nVar * nVar]; + const ScalarType* scale = &q_scale[k * nVar]; + for (auto r = 0ul; r < nVar; ++r) { + for (auto c = 0ul; c < nVar; ++c) { + block[r * nVar + c] = scale[r] * q[r * nVar + c]; + } + } + } else { + MatrixCopy(&matrix[dia_ptr[block_i] * nVar * nVar], block); + } Gauss_Elimination(block, rhs); } @@ -169,6 +179,18 @@ FORCEINLINE const ScalarType* CSysMatrix::InvertDiagonalBlockILUMatr return Uii; } +template +FORCEINLINE void CSysMatrix::QuantizedMatVecAdd(unsigned long k, const ScalarType* vec, + ScalarType* prod) const { + const ScalarType* scale = &q_scale[k * nVar]; + const QuantType* q = &q_offdiag[k * nVar * nVar]; + for (auto r = 0ul; r < nVar; ++r) { + auto sum = ScalarType(0); + for (auto c = 0ul; c < nVar; ++c) sum += q[r * nVar + c] * vec[c]; + prod[r] += scale[r] * sum; + } +} + template FORCEINLINE void CSysMatrix::RowProduct(const CSysVector& vec, unsigned long row_i, ScalarType* prod) const { @@ -176,7 +198,11 @@ FORCEINLINE void CSysMatrix::RowProduct(const CSysVector for (auto index = row_ptr[row_i]; index < row_ptr[row_i + 1]; index++) { auto col_j = col_ind[index]; - MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod); + if (USE_QUANTIZATION && nVar > 1) { + QuantizedMatVecAdd(index, &vec[col_j * nEqn], prod); + } else { + MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod); + } } } @@ -188,8 +214,13 @@ FORCEINLINE void CSysMatrix::UpperProduct(const CSysVector= nPointDomain) - MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod); + if (col_j < col_ub || col_j >= nPointDomain) { + if (USE_QUANTIZATION && nVar > 1) { + QuantizedMatVecAdd(index, &vec[col_j * nEqn], prod); + } else { + MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod); + } + } } } @@ -200,12 +231,23 @@ FORCEINLINE void CSysMatrix::LowerProduct(const CSysVector= col_lb) MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod); + if (col_j >= col_lb) { + if (USE_QUANTIZATION && nVar > 1) { + QuantizedMatVecAdd(index, &vec[col_j * nEqn], prod); + } else { + MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod); + } + } } } template FORCEINLINE void CSysMatrix::DiagonalProduct(const CSysVector& vec, unsigned long row_i, ScalarType* prod) const { - MatrixVectorProduct(&matrix[dia_ptr[row_i] * nVar * nEqn], &vec[row_i * nEqn], prod); + if (USE_QUANTIZATION && nVar > 1) { + for (auto iVar = 0ul; iVar < nVar; iVar++) prod[iVar] = 0.0; + QuantizedMatVecAdd(dia_ptr[row_i], &vec[row_i * nEqn], prod); + } else { + MatrixVectorProduct(&matrix[dia_ptr[row_i] * nVar * nEqn], &vec[row_i * nEqn], prod); + } } diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index 00ef51c2023..e71ea138468 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -31,6 +31,7 @@ #include "../../include/toolboxes/allocation_toolbox.hpp" #include +#include template CSysMatrix::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::GetSize()) { @@ -48,6 +49,9 @@ CSysMatrix::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G col_ind = nullptr; col_ptr = nullptr; + q_offdiag = nullptr; + q_scale = nullptr; + ILU_matrix = nullptr; row_ptr_ilu = nullptr; dia_ptr_ilu = nullptr; @@ -71,6 +75,8 @@ CSysMatrix::~CSysMatrix() { MemoryAllocation::aligned_free(ILU_matrix); MemoryAllocation::aligned_free(matrix); MemoryAllocation::aligned_free(invM); + MemoryAllocation::aligned_free(q_offdiag); + MemoryAllocation::aligned_free(q_scale); if (useCuda) { GPUMemoryAllocation::gpu_free(d_matrix); @@ -488,6 +494,40 @@ void CSysMatrixComms::Complete(CSysVector& x, CGeometry* geometry, const CCon #endif } +template +void CSysMatrix::QuantizeOffDiagonalBlocks() { + SU2_ZONE_SCOPED + + if (nVar == 1) return; + + if (q_scale == nullptr) { + q_offdiag = MemoryAllocation::aligned_alloc(64, nnz * nVar * nVar * sizeof(QuantType)); + q_scale = MemoryAllocation::aligned_alloc(64, nnz * nVar * sizeof(ScalarType)); + } + + SU2_OMP_FOR_DYN(omp_heavy_size) + for (auto k = 0ul; k < nnz; ++k) { + for (auto r = 0ul; r < nVar; ++r) { + const ScalarType* __restrict row = &matrix[(k * nVar + r) * nVar]; + ScalarType& __restrict scale = q_scale[k * nVar + r]; + QuantType* __restrict q_row = &q_offdiag[(k * nVar + r) * nVar]; + + /*--- Max absolute entry in each row as the scale. ---*/ + scale = EPS; + for (auto c = 0ul; c < nVar; ++c) { + scale = fmax(scale, fabs(row[c])); + } + scale /= std::numeric_limits::max(); + + const ScalarType inv_s = 1 / scale; + for (auto c = 0ul; c < nVar; ++c) { + q_row[c] = static_cast(std::round(SU2_TYPE::GetValue(row[c] * inv_s))); + } + } + } + END_SU2_OMP_FOR +} + template void CSysMatrix::SetValZero() { SU2_ZONE_SCOPED @@ -628,7 +668,6 @@ void CSysMatrix::MatrixInverse(ScalarType* matrix, ScalarType* inver template void CSysMatrix::DeleteValsRowi(unsigned long block_i, unsigned long row) { - SU2_ZONE_SCOPED for (auto index = row_ptr[block_i]; index < row_ptr[block_i + 1]; index++) { for (auto iVar = 0u; iVar < nVar; iVar++) matrix[index * nVar * nVar + row * nVar + iVar] = 0.0; // Delete row values in the block diff --git a/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl b/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl index 05ec25a0e56..87b158e45b3 100644 --- a/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl +++ b/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl @@ -649,6 +649,7 @@ void CFVMFlowSolverBase::ImplicitEuler_Iteration(CGeometry *geometry, CSol } END_SU2_OMP_FOR + Jacobian.QuantizeOffDiagonalBlocks(); auto iter = System.Solve(Jacobian, LinSysRes, LinSysSol, geometry, config); BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS { From 9d44c54e8bd9943ec8296c0c71560239507c2d50 Mon Sep 17 00:00:00 2001 From: Pedro Gomes Date: Sat, 27 Jun 2026 11:27:21 -0700 Subject: [PATCH 2/6] [skip ci] From 5e162270a14a7fdf22c2c1395b2c2fe4525da4e9 Mon Sep 17 00:00:00 2001 From: Pedro Gomes Date: Sat, 27 Jun 2026 11:32:24 -0700 Subject: [PATCH 3/6] [skip ci] --- SU2_CFD/include/solvers/CScalarSolver.inl | 1 + 1 file changed, 1 insertion(+) diff --git a/SU2_CFD/include/solvers/CScalarSolver.inl b/SU2_CFD/include/solvers/CScalarSolver.inl index 7d6871f8ca4..5847879f7b5 100644 --- a/SU2_CFD/include/solvers/CScalarSolver.inl +++ b/SU2_CFD/include/solvers/CScalarSolver.inl @@ -572,6 +572,7 @@ void CScalarSolver::ImplicitEuler_Iteration(CGeometry* geometry, C } END_SU2_OMP_FOR + Jacobian.QuantizeOffDiagonalBlocks(); auto iter = System.Solve(Jacobian, LinSysRes, LinSysSol, geometry, config); BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS { From 1e42de1ba374313b0c03846d06e32fb742e72b5f Mon Sep 17 00:00:00 2001 From: Pedro Gomes Date: Sat, 27 Jun 2026 20:49:51 -0700 Subject: [PATCH 4/6] full crazy --- Common/include/linear_algebra/CSysMatrix.hpp | 5 +-- Common/include/linear_algebra/CSysMatrix.inl | 34 ++++++++++++++---- Common/src/linear_algebra/CSysMatrix.cpp | 37 +++++++++++++------- 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index 2104f6b8908..1b6beaff213 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -133,8 +133,9 @@ class CSysMatrix { * the original full-precision storage. Call QuantizeOffDiagonalBlocks() first. */ static constexpr bool USE_QUANTIZATION = true; - ScalarType* q_scale; /*!< \brief Scale of each block row (full precision), [nnz * nVar]. */ - QuantType* q_offdiag; /*!< \brief Quantized off-diagonal entries per block, [nnz * nVar * nVar]. */ + ScalarType* q_bscale; /*!< \brief Per-block scale (full precision), [nnz]. */ + uint8_t* q_scale; /*!< \brief Per-row scale relative to q_bscale as custom float8 (5e|3s), [nnz * nVar]. */ + QuantType* q_offdiag; /*!< \brief Quantized entries per block, [nnz * nVar * nVar]. */ enum { OMP_MAX_SIZE_L = 8192 }; /*!< \brief Max. chunk size used in light parallel for loops. */ enum { OMP_MAX_SIZE_H = 512 }; /*!< \brief Max. chunk size used in heavy parallel for loops. */ diff --git a/Common/include/linear_algebra/CSysMatrix.inl b/Common/include/linear_algebra/CSysMatrix.inl index 56de5ae428b..f6e0b6a65dc 100644 --- a/Common/include/linear_algebra/CSysMatrix.inl +++ b/Common/include/linear_algebra/CSysMatrix.inl @@ -82,6 +82,24 @@ FORCEINLINE void gemm_impl(unsigned long n, const T* a, const T* b, T* c) { } } } +/*--- Custom 8-bit float for per-row scales: layout [eeeee|sss], value = (1 + s/8) * 2^(-e). + * Exponent is always non-negative so all values are in (0, ~1.875]. + * No sign, no zero — used to encode ratios in (0, 1] relative to the block peak. ---*/ +inline uint8_t float8_encode(double v) { + if (v <= 0.0) return static_cast(31 << 3); // clamp to minimum + int fe; + const double m = std::frexp(v, &fe); + int e = 1 - fe; + int s = static_cast(std::round((2.0 * m - 1.0) * 8.0)); + if (s >= 8) { + --e; + s = 0; + } // carry: move to next higher power-of-two band (larger value → smaller e) + return static_cast((std::max(0, std::min(31, e)) << 3) | std::max(0, std::min(7, s))); +} + +inline double float8_decode(uint8_t b) { return std::ldexp(1.0 + (b & 7) * 0.125, -(b >> 3)); } + } // namespace #define __MATVECPROD_SIGNATURE__(TYPE, NAME) \ @@ -147,12 +165,12 @@ FORCEINLINE void CSysMatrix::Gauss_Elimination(unsigned long block_i ScalarType block[MAXNVAR * MAXNVAR]; if (USE_QUANTIZATION && nVar > 1) { const auto k = dia_ptr[block_i]; + const ScalarType bscale = q_bscale[k]; + const uint8_t* qscale = &q_scale[k * nVar]; const QuantType* q = &q_offdiag[k * nVar * nVar]; - const ScalarType* scale = &q_scale[k * nVar]; for (auto r = 0ul; r < nVar; ++r) { - for (auto c = 0ul; c < nVar; ++c) { - block[r * nVar + c] = scale[r] * q[r * nVar + c]; - } + const ScalarType row_scale = bscale * ScalarType(float8_decode(qscale[r])); + for (auto c = 0ul; c < nVar; ++c) block[r * nVar + c] = row_scale * static_cast(q[r * nVar + c]); } } else { MatrixCopy(&matrix[dia_ptr[block_i] * nVar * nVar], block); @@ -182,12 +200,14 @@ FORCEINLINE const ScalarType* CSysMatrix::InvertDiagonalBlockILUMatr template FORCEINLINE void CSysMatrix::QuantizedMatVecAdd(unsigned long k, const ScalarType* vec, ScalarType* prod) const { - const ScalarType* scale = &q_scale[k * nVar]; + const ScalarType bscale = q_bscale[k]; + const uint8_t* qscale = &q_scale[k * nVar]; const QuantType* q = &q_offdiag[k * nVar * nVar]; for (auto r = 0ul; r < nVar; ++r) { + const ScalarType row_scale = bscale * ScalarType(float8_decode(qscale[r])); auto sum = ScalarType(0); - for (auto c = 0ul; c < nVar; ++c) sum += q[r * nVar + c] * vec[c]; - prod[r] += scale[r] * sum; + for (auto c = 0ul; c < nVar; ++c) sum += static_cast(q[r * nVar + c]) * vec[c]; + prod[r] += row_scale * sum; } } diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index e71ea138468..86f7a752c00 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -49,6 +49,7 @@ CSysMatrix::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G col_ind = nullptr; col_ptr = nullptr; + q_bscale = nullptr; q_offdiag = nullptr; q_scale = nullptr; @@ -75,6 +76,7 @@ CSysMatrix::~CSysMatrix() { MemoryAllocation::aligned_free(ILU_matrix); MemoryAllocation::aligned_free(matrix); MemoryAllocation::aligned_free(invM); + MemoryAllocation::aligned_free(q_bscale); MemoryAllocation::aligned_free(q_offdiag); MemoryAllocation::aligned_free(q_scale); @@ -500,28 +502,39 @@ void CSysMatrix::QuantizeOffDiagonalBlocks() { if (nVar == 1) return; - if (q_scale == nullptr) { + if (q_bscale == nullptr) { + q_bscale = MemoryAllocation::aligned_alloc(64, nnz * sizeof(ScalarType)); + q_scale = MemoryAllocation::aligned_alloc(64, nnz * nVar * sizeof(uint8_t)); q_offdiag = MemoryAllocation::aligned_alloc(64, nnz * nVar * nVar * sizeof(QuantType)); - q_scale = MemoryAllocation::aligned_alloc(64, nnz * nVar * sizeof(ScalarType)); } + constexpr double q_max = std::numeric_limits::max(); SU2_OMP_FOR_DYN(omp_heavy_size) for (auto k = 0ul; k < nnz; ++k) { + const ScalarType* __restrict blk = &matrix[k * nVar * nVar]; + + /*--- Per-block max absolute value → block scale: q_bscale * INT8_MAX ≈ max_blk. ---*/ + double max_blk = EPS; + for (auto idx = 0ul; idx < nVar * nVar; ++idx) max_blk = std::max(max_blk, std::abs(SU2_TYPE::GetValue(blk[idx]))); + q_bscale[k] = ScalarType(max_blk / q_max); + for (auto r = 0ul; r < nVar; ++r) { - const ScalarType* __restrict row = &matrix[(k * nVar + r) * nVar]; - ScalarType& __restrict scale = q_scale[k * nVar + r]; + const ScalarType* __restrict row = &blk[r * nVar]; QuantType* __restrict q_row = &q_offdiag[(k * nVar + r) * nVar]; - /*--- Max absolute entry in each row as the scale. ---*/ - scale = EPS; - for (auto c = 0ul; c < nVar; ++c) { - scale = fmax(scale, fabs(row[c])); - } - scale /= std::numeric_limits::max(); + /*--- Per-row max → encode ratio to block max as float8. ---*/ + double max_row = 0.0; + for (auto c = 0ul; c < nVar; ++c) max_row = std::max(max_row, std::abs(SU2_TYPE::GetValue(row[c]))); + q_scale[k * nVar + r] = float8_encode(max_row / max_blk); + + /*--- Effective row scale = q_bscale * float8(ratio) ≈ max_row / INT8_MAX. ---*/ + const double eff = max_blk / q_max * float8_decode(q_scale[k * nVar + r]); + const ScalarType inv_rscale = (eff > 0.0) ? ScalarType(1.0 / eff) : ScalarType(0); - const ScalarType inv_s = 1 / scale; for (auto c = 0ul; c < nVar; ++c) { - q_row[c] = static_cast(std::round(SU2_TYPE::GetValue(row[c] * inv_s))); + const double qval = std::round(SU2_TYPE::GetValue(row[c]) * SU2_TYPE::GetValue(inv_rscale)); + q_row[c] = static_cast(std::max(double(std::numeric_limits::min()), + std::min(double(std::numeric_limits::max()), qval))); } } } From 14116a4815ab952d55c9b252efdcb412be33d345 Mon Sep 17 00:00:00 2001 From: Pedro Gomes Date: Sat, 27 Jun 2026 21:15:31 -0700 Subject: [PATCH 5/6] micro optimize --- Common/include/linear_algebra/CSysMatrix.inl | 27 ++++++++++++++------ Common/src/linear_algebra/CSysMatrix.cpp | 17 ++++++------ 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/Common/include/linear_algebra/CSysMatrix.inl b/Common/include/linear_algebra/CSysMatrix.inl index f6e0b6a65dc..217304b0064 100644 --- a/Common/include/linear_algebra/CSysMatrix.inl +++ b/Common/include/linear_algebra/CSysMatrix.inl @@ -85,20 +85,31 @@ FORCEINLINE void gemm_impl(unsigned long n, const T* a, const T* b, T* c) { /*--- Custom 8-bit float for per-row scales: layout [eeeee|sss], value = (1 + s/8) * 2^(-e). * Exponent is always non-negative so all values are in (0, ~1.875]. * No sign, no zero — used to encode ratios in (0, 1] relative to the block peak. ---*/ +inline uint8_t float8_encode(float v) { + uint32_t bits; + memcpy(&bits, &v, sizeof(bits)); + int e = 127 - static_cast((bits >> 23) & 0xFFu); + int s = static_cast((bits >> 20) & 7u) + static_cast((bits >> 19) & 1u); + if (s >= 8) { + --e; + s = 0; + } + return static_cast((std::max(0, std::min(31, e)) << 3) | s); +} + inline uint8_t float8_encode(double v) { - if (v <= 0.0) return static_cast(31 << 3); // clamp to minimum - int fe; - const double m = std::frexp(v, &fe); - int e = 1 - fe; - int s = static_cast(std::round((2.0 * m - 1.0) * 8.0)); + uint64_t bits; + memcpy(&bits, &v, sizeof(bits)); + int e = 1023 - static_cast((bits >> 52) & 0x7FFu); + int s = static_cast((bits >> 49) & 7u) + static_cast((bits >> 48) & 1u); if (s >= 8) { --e; s = 0; - } // carry: move to next higher power-of-two band (larger value → smaller e) - return static_cast((std::max(0, std::min(31, e)) << 3) | std::max(0, std::min(7, s))); + } + return static_cast((std::max(0, std::min(31, e)) << 3) | s); } -inline double float8_decode(uint8_t b) { return std::ldexp(1.0 + (b & 7) * 0.125, -(b >> 3)); } +inline float float8_decode(uint8_t b) { return std::ldexp(1.0f + (b & 7) * 0.125f, -(b >> 3)); } } // namespace diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index 86f7a752c00..6a6a4279eed 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -507,29 +507,30 @@ void CSysMatrix::QuantizeOffDiagonalBlocks() { q_scale = MemoryAllocation::aligned_alloc(64, nnz * nVar * sizeof(uint8_t)); q_offdiag = MemoryAllocation::aligned_alloc(64, nnz * nVar * nVar * sizeof(QuantType)); } - constexpr double q_max = std::numeric_limits::max(); + const ScalarType q_max = std::numeric_limits::max(); SU2_OMP_FOR_DYN(omp_heavy_size) for (auto k = 0ul; k < nnz; ++k) { const ScalarType* __restrict blk = &matrix[k * nVar * nVar]; /*--- Per-block max absolute value → block scale: q_bscale * INT8_MAX ≈ max_blk. ---*/ - double max_blk = EPS; - for (auto idx = 0ul; idx < nVar * nVar; ++idx) max_blk = std::max(max_blk, std::abs(SU2_TYPE::GetValue(blk[idx]))); - q_bscale[k] = ScalarType(max_blk / q_max); + ScalarType max_blk = EPS; + for (auto idx = 0ul; idx < nVar * nVar; ++idx) + max_blk = std::max(max_blk, ScalarType(std::abs(SU2_TYPE::GetValue(blk[idx])))); + q_bscale[k] = max_blk / q_max; for (auto r = 0ul; r < nVar; ++r) { const ScalarType* __restrict row = &blk[r * nVar]; QuantType* __restrict q_row = &q_offdiag[(k * nVar + r) * nVar]; /*--- Per-row max → encode ratio to block max as float8. ---*/ - double max_row = 0.0; - for (auto c = 0ul; c < nVar; ++c) max_row = std::max(max_row, std::abs(SU2_TYPE::GetValue(row[c]))); + auto max_row = ScalarType(0); + for (auto c = 0ul; c < nVar; ++c) max_row = std::max(max_row, ScalarType(std::abs(SU2_TYPE::GetValue(row[c])))); q_scale[k * nVar + r] = float8_encode(max_row / max_blk); /*--- Effective row scale = q_bscale * float8(ratio) ≈ max_row / INT8_MAX. ---*/ - const double eff = max_blk / q_max * float8_decode(q_scale[k * nVar + r]); - const ScalarType inv_rscale = (eff > 0.0) ? ScalarType(1.0 / eff) : ScalarType(0); + const ScalarType eff = max_blk / q_max * float8_decode(q_scale[k * nVar + r]); + const ScalarType inv_rscale = (eff > ScalarType(0)) ? ScalarType(1) / eff : ScalarType(0); for (auto c = 0ul; c < nVar; ++c) { const double qval = std::round(SU2_TYPE::GetValue(row[c]) * SU2_TYPE::GetValue(inv_rscale)); From c86a8a23e784e43ff96759b7918ed34a3c9c430d Mon Sep 17 00:00:00 2001 From: Pedro Gomes Date: Sun, 28 Jun 2026 18:10:20 -0700 Subject: [PATCH 6/6] compress more [skip ci] --- .../basic_types/datatype_structure.hpp | 4 ++ Common/include/linear_algebra/CSysMatrix.hpp | 3 +- Common/include/linear_algebra/CSysMatrix.inl | 49 ++++--------------- Common/src/linear_algebra/CSysMatrix.cpp | 47 +++++++++--------- Common/src/linear_algebra/CSysSolve.cpp | 12 +++-- 5 files changed, 45 insertions(+), 70 deletions(-) diff --git a/Common/include/basic_types/datatype_structure.hpp b/Common/include/basic_types/datatype_structure.hpp index 623c410e551..d9b6bb0b58f 100644 --- a/Common/include/basic_types/datatype_structure.hpp +++ b/Common/include/basic_types/datatype_structure.hpp @@ -135,6 +135,10 @@ template <> struct Passive { FORCEINLINE static passivedouble Value(const su2double& val) { return GetValue(val); } }; +template +FORCEINLINE auto PassiveValue(const T& val) { + return Passive::Value(val); +} /*! * \brief Casts the primitive value to int (uses GetValue, already implemented for each type). diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index 1b6beaff213..1d06f457d43 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -133,8 +133,7 @@ class CSysMatrix { * the original full-precision storage. Call QuantizeOffDiagonalBlocks() first. */ static constexpr bool USE_QUANTIZATION = true; - ScalarType* q_bscale; /*!< \brief Per-block scale (full precision), [nnz]. */ - uint8_t* q_scale; /*!< \brief Per-row scale relative to q_bscale as custom float8 (5e|3s), [nnz * nVar]. */ + QuantType* q_scale; /*!< \brief Per-row signed binary exponent e; scale = 2^e, [nnz * nVar]. */ QuantType* q_offdiag; /*!< \brief Quantized entries per block, [nnz * nVar * nVar]. */ enum { OMP_MAX_SIZE_L = 8192 }; /*!< \brief Max. chunk size used in light parallel for loops. */ diff --git a/Common/include/linear_algebra/CSysMatrix.inl b/Common/include/linear_algebra/CSysMatrix.inl index 217304b0064..713d3db1a4b 100644 --- a/Common/include/linear_algebra/CSysMatrix.inl +++ b/Common/include/linear_algebra/CSysMatrix.inl @@ -82,35 +82,6 @@ FORCEINLINE void gemm_impl(unsigned long n, const T* a, const T* b, T* c) { } } } -/*--- Custom 8-bit float for per-row scales: layout [eeeee|sss], value = (1 + s/8) * 2^(-e). - * Exponent is always non-negative so all values are in (0, ~1.875]. - * No sign, no zero — used to encode ratios in (0, 1] relative to the block peak. ---*/ -inline uint8_t float8_encode(float v) { - uint32_t bits; - memcpy(&bits, &v, sizeof(bits)); - int e = 127 - static_cast((bits >> 23) & 0xFFu); - int s = static_cast((bits >> 20) & 7u) + static_cast((bits >> 19) & 1u); - if (s >= 8) { - --e; - s = 0; - } - return static_cast((std::max(0, std::min(31, e)) << 3) | s); -} - -inline uint8_t float8_encode(double v) { - uint64_t bits; - memcpy(&bits, &v, sizeof(bits)); - int e = 1023 - static_cast((bits >> 52) & 0x7FFu); - int s = static_cast((bits >> 49) & 7u) + static_cast((bits >> 48) & 1u); - if (s >= 8) { - --e; - s = 0; - } - return static_cast((std::max(0, std::min(31, e)) << 3) | s); -} - -inline float float8_decode(uint8_t b) { return std::ldexp(1.0f + (b & 7) * 0.125f, -(b >> 3)); } - } // namespace #define __MATVECPROD_SIGNATURE__(TYPE, NAME) \ @@ -176,12 +147,13 @@ FORCEINLINE void CSysMatrix::Gauss_Elimination(unsigned long block_i ScalarType block[MAXNVAR * MAXNVAR]; if (USE_QUANTIZATION && nVar > 1) { const auto k = dia_ptr[block_i]; - const ScalarType bscale = q_bscale[k]; - const uint8_t* qscale = &q_scale[k * nVar]; - const QuantType* q = &q_offdiag[k * nVar * nVar]; + const QuantType* __restrict qscale = &q_scale[k * nVar]; + const QuantType* __restrict q = &q_offdiag[k * nVar * nVar]; for (auto r = 0ul; r < nVar; ++r) { - const ScalarType row_scale = bscale * ScalarType(float8_decode(qscale[r])); - for (auto c = 0ul; c < nVar; ++c) block[r * nVar + c] = row_scale * static_cast(q[r * nVar + c]); + const uint32_t rs_bits = static_cast(std::max(0, static_cast(qscale[r]) + 127)) << 23; + float row_scale; + memcpy(&row_scale, &rs_bits, sizeof(rs_bits)); + for (auto c = 0ul; c < nVar; ++c) block[r * nVar + c] = row_scale * q[r * nVar + c]; } } else { MatrixCopy(&matrix[dia_ptr[block_i] * nVar * nVar], block); @@ -211,13 +183,12 @@ FORCEINLINE const ScalarType* CSysMatrix::InvertDiagonalBlockILUMatr template FORCEINLINE void CSysMatrix::QuantizedMatVecAdd(unsigned long k, const ScalarType* vec, ScalarType* prod) const { - const ScalarType bscale = q_bscale[k]; - const uint8_t* qscale = &q_scale[k * nVar]; - const QuantType* q = &q_offdiag[k * nVar * nVar]; + const QuantType* __restrict qscale = &q_scale[k * nVar]; + const QuantType* __restrict q = &q_offdiag[k * nVar * nVar]; for (auto r = 0ul; r < nVar; ++r) { - const ScalarType row_scale = bscale * ScalarType(float8_decode(qscale[r])); + const auto row_scale = ldexpf(1, static_cast(qscale[r])); auto sum = ScalarType(0); - for (auto c = 0ul; c < nVar; ++c) sum += static_cast(q[r * nVar + c]) * vec[c]; + for (auto c = 0ul; c < nVar; ++c) sum += q[r * nVar + c] * vec[c]; prod[r] += row_scale * sum; } } diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp index 6a6a4279eed..95e15f76667 100644 --- a/Common/src/linear_algebra/CSysMatrix.cpp +++ b/Common/src/linear_algebra/CSysMatrix.cpp @@ -49,7 +49,6 @@ CSysMatrix::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G col_ind = nullptr; col_ptr = nullptr; - q_bscale = nullptr; q_offdiag = nullptr; q_scale = nullptr; @@ -76,7 +75,6 @@ CSysMatrix::~CSysMatrix() { MemoryAllocation::aligned_free(ILU_matrix); MemoryAllocation::aligned_free(matrix); MemoryAllocation::aligned_free(invM); - MemoryAllocation::aligned_free(q_bscale); MemoryAllocation::aligned_free(q_offdiag); MemoryAllocation::aligned_free(q_scale); @@ -502,40 +500,39 @@ void CSysMatrix::QuantizeOffDiagonalBlocks() { if (nVar == 1) return; - if (q_bscale == nullptr) { - q_bscale = MemoryAllocation::aligned_alloc(64, nnz * sizeof(ScalarType)); - q_scale = MemoryAllocation::aligned_alloc(64, nnz * nVar * sizeof(uint8_t)); + if (q_scale == nullptr) { + q_scale = MemoryAllocation::aligned_alloc(64, nnz * nVar * sizeof(QuantType)); q_offdiag = MemoryAllocation::aligned_alloc(64, nnz * nVar * nVar * sizeof(QuantType)); } - const ScalarType q_max = std::numeric_limits::max(); SU2_OMP_FOR_DYN(omp_heavy_size) for (auto k = 0ul; k < nnz; ++k) { - const ScalarType* __restrict blk = &matrix[k * nVar * nVar]; - - /*--- Per-block max absolute value → block scale: q_bscale * INT8_MAX ≈ max_blk. ---*/ - ScalarType max_blk = EPS; - for (auto idx = 0ul; idx < nVar * nVar; ++idx) - max_blk = std::max(max_blk, ScalarType(std::abs(SU2_TYPE::GetValue(blk[idx])))); - q_bscale[k] = max_blk / q_max; - for (auto r = 0ul; r < nVar; ++r) { - const ScalarType* __restrict row = &blk[r * nVar]; + const ScalarType* __restrict row = &matrix[(k * nVar + r) * nVar]; QuantType* __restrict q_row = &q_offdiag[(k * nVar + r) * nVar]; - /*--- Per-row max → encode ratio to block max as float8. ---*/ - auto max_row = ScalarType(0); - for (auto c = 0ul; c < nVar; ++c) max_row = std::max(max_row, ScalarType(std::abs(SU2_TYPE::GetValue(row[c])))); - q_scale[k * nVar + r] = float8_encode(max_row / max_blk); + /*--- Integer max over sign-cleared float bits gives max-abs with no float compare. + * The biased exponent is then free — no extra extraction step needed. ---*/ + constexpr uint32_t eps_bits = 0x34000000u; // FLT_EPSILON ≈ 1.19e-7, floor for degenerate rows + uint32_t max_abs_bits = eps_bits; + for (auto c = 0ul; c < nVar; ++c) { + uint32_t fb; + const float fv = SU2_TYPE::PassiveValue(row[c]); + memcpy(&fb, &fv, sizeof(fb)); + max_abs_bits = std::max(max_abs_bits, fb & 0x7FFFFFFFu); + } - /*--- Effective row scale = q_bscale * float8(ratio) ≈ max_row / INT8_MAX. ---*/ - const ScalarType eff = max_blk / q_max * float8_decode(q_scale[k * nVar + r]); - const ScalarType inv_rscale = (eff > ScalarType(0)) ? ScalarType(1) / eff : ScalarType(0); + /*--- e = biased_exp - 133 puts quantized max-abs in [64,128). ---*/ + const int e_clamped = std::min(127, std::max(-128, static_cast(max_abs_bits >> 23) - 133)); + q_scale[k * nVar + r] = static_cast(e_clamped); + /*--- 2^(-e) is a float with biased_exp = (127-e), zero mantissa — no ldexpf needed. ---*/ + const uint32_t inv_bits = static_cast(127 - e_clamped) << 23; + float inv_rscale; + memcpy(&inv_rscale, &inv_bits, sizeof(inv_rscale)); for (auto c = 0ul; c < nVar; ++c) { - const double qval = std::round(SU2_TYPE::GetValue(row[c]) * SU2_TYPE::GetValue(inv_rscale)); - q_row[c] = static_cast(std::max(double(std::numeric_limits::min()), - std::min(double(std::numeric_limits::max()), qval))); + q_row[c] = static_cast( + std::max(-128.f, std::min(127.f, roundf(SU2_TYPE::PassiveValue(row[c]) * inv_rscale)))); } } } diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp index cd100f311ab..2a58215d2f8 100644 --- a/Common/src/linear_algebra/CSysSolve.cpp +++ b/Common/src/linear_algebra/CSysSolve.cpp @@ -212,11 +212,15 @@ bool CSysSolve::ModGramSchmidt(bool shared_hsbg, int i, su2matrix::multiDot(w, i + 1, 1, w, i + 1); - LinearCombination( - shared_hsbg, i + 1, w, [&dh_i](int k) { return -dh_i(0, k); }, w[i + 1], true); + if (i < 5) { + for (int k = 0; k < i + 1; k++) SetHsbg(k, i, h_i(0, k)); + } else { + const auto& dh_i = CSysVector::multiDot(w, i + 1, 1, w, i + 1); + LinearCombination( + shared_hsbg, i + 1, w, [&dh_i](int k) { return -dh_i(0, k); }, w[i + 1], true); - for (int k = 0; k < i + 1; k++) SetHsbg(k, i, h_i(0, k) + dh_i(0, k)); + for (int k = 0; k < i + 1; k++) SetHsbg(k, i, h_i(0, k) + dh_i(0, k)); + } /*--- The norm of w[i+1] is 0 or NaN: the input vector from mat_vec is * zero or contains NaN. Cannot proceed with orthogonalization. ---*/