From 09c8b25dd20c7c42cb7c354f7c2b0b75ab0f29ef Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Sat, 27 Jun 2026 11:25:23 -0700
Subject: [PATCH 1/6] int8 quantization

---
 Common/include/linear_algebra/CSysMatrix.hpp  | 25 +++++++++
 Common/include/linear_algebra/CSysMatrix.inl  | 56 ++++++++++++++++---
 Common/src/linear_algebra/CSysMatrix.cpp      | 41 +++++++++++++-
 .../include/solvers/CFVMFlowSolverBase.inl    |  1 +
 4 files changed, 115 insertions(+), 8 deletions(-)
diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
index ecb26959a06..2104f6b8908 100644
--- a/Common/include/linear_algebra/CSysMatrix.hpp
+++ b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -33,6 +33,7 @@
 #include "CPastixWrapper.hpp"
 #include "../toolboxes/graph_toolbox.hpp"
 
+#include <cstdint>
 #include <cstdlib>
 #include <vector>
 #include <cassert>
@@ -125,6 +126,16 @@ class CSysMatrix {
    * size is needed for fast, per-thread, static memory allocation. */
   enum : size_t { MAXNVAR = 20 };
 
+  /*--- Quantized off-diagonal storage. ---*/
+  using QuantType = int8_t;
+  /*! \brief When true, off-diagonal blocks are read from the quantized representation
+   *         during matrix-vector products and LU-SGS; the diagonal block always uses
+   *         the original full-precision storage. Call QuantizeOffDiagonalBlocks() first. */
+  static constexpr bool USE_QUANTIZATION = true;
+
+  ScalarType* q_scale;  /*!< \brief Scale of each block row (full precision), [nnz * nVar]. */
+  QuantType* q_offdiag; /*!< \brief Quantized off-diagonal entries per block, [nnz * nVar * nVar]. */
+
   enum { OMP_MAX_SIZE_L = 8192 }; /*!< \brief Max. chunk size used in light parallel for loops. */
   enum { OMP_MAX_SIZE_H = 512 };  /*!< \brief Max. chunk size used in heavy parallel for loops. */
   enum { OMP_MIN_SIZE = 32 };     /*!< \brief Chunk size for finer grain operations. */
@@ -372,6 +383,15 @@ class CSysMatrix {
    */
   void RowProduct(const CSysVector<ScalarType>& vec, unsigned long row_i, ScalarType* prod) const;
 
+  /*!
+   * \brief Computes product += A_k * vec using the quantized representation of block k.
+   * \note Only valid after QuantizeOffDiagonalBlocks() has been called.
+   * \param[in] k - Block index in the CSR flat storage.
+   * \param[in] vec - Input vector (nEqn entries).
+   * \param[in,out] prod - Accumulation output (nVar entries).
+   */
+  inline void QuantizedMatVecAdd(unsigned long k, const ScalarType* vec, ScalarType* prod) const;
+
  public:
   /*!
    * \brief Constructor of the class.
@@ -398,6 +418,11 @@ class CSysMatrix {
                   bool EdgeConnect, CGeometry* geometry, const CConfig* config, bool needTranspPtr = false,
                   bool grad_mode = false);
 
+  /*!
+   * \brief Compresses off-diagonal blocks into quantized form for use with USE_QUANTIZATION.
+   */
+  void QuantizeOffDiagonalBlocks();
+
   /*!
    * \brief Sets to zero all the entries of the sparse matrix.
    */
diff --git a/Common/include/linear_algebra/CSysMatrix.inl b/Common/include/linear_algebra/CSysMatrix.inl
index 163e6fb08da..56de5ae428b 100644
--- a/Common/include/linear_algebra/CSysMatrix.inl
+++ b/Common/include/linear_algebra/CSysMatrix.inl
@@ -145,8 +145,18 @@ template <class ScalarType>
 FORCEINLINE void CSysMatrix<ScalarType>::Gauss_Elimination(unsigned long block_i, ScalarType* rhs) const {
   /*--- Copy block, as the algorithm modifies the matrix ---*/
   ScalarType block[MAXNVAR * MAXNVAR];
-  MatrixCopy(&matrix[dia_ptr[block_i] * nVar * nVar], block);
-
+  if (USE_QUANTIZATION && nVar > 1) {
+    const auto k = dia_ptr[block_i];
+    const QuantType* q = &q_offdiag[k * nVar * nVar];
+    const ScalarType* scale = &q_scale[k * nVar];
+    for (auto r = 0ul; r < nVar; ++r) {
+      for (auto c = 0ul; c < nVar; ++c) {
+        block[r * nVar + c] = scale[r] * q[r * nVar + c];
+      }
+    }
+  } else {
+    MatrixCopy(&matrix[dia_ptr[block_i] * nVar * nVar], block);
+  }
   Gauss_Elimination(block, rhs);
 }
 
@@ -169,6 +179,18 @@ FORCEINLINE const ScalarType* CSysMatrix<ScalarType>::InvertDiagonalBlockILUMatr
   return Uii;
 }
 
+template <class ScalarType>
+FORCEINLINE void CSysMatrix<ScalarType>::QuantizedMatVecAdd(unsigned long k, const ScalarType* vec,
+                                                            ScalarType* prod) const {
+  const ScalarType* scale = &q_scale[k * nVar];
+  const QuantType* q = &q_offdiag[k * nVar * nVar];
+  for (auto r = 0ul; r < nVar; ++r) {
+    auto sum = ScalarType(0);
+    for (auto c = 0ul; c < nVar; ++c) sum += q[r * nVar + c] * vec[c];
+    prod[r] += scale[r] * sum;
+  }
+}
+
 template <class ScalarType>
 FORCEINLINE void CSysMatrix<ScalarType>::RowProduct(const CSysVector<ScalarType>& vec, unsigned long row_i,
                                                     ScalarType* prod) const {
@@ -176,7 +198,11 @@ FORCEINLINE void CSysMatrix<ScalarType>::RowProduct(const CSysVector<ScalarType>
 
   for (auto index = row_ptr[row_i]; index < row_ptr[row_i + 1]; index++) {
     auto col_j = col_ind[index];
-    MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
+    if (USE_QUANTIZATION && nVar > 1) {
+      QuantizedMatVecAdd(index, &vec[col_j * nEqn], prod);
+    } else {
+      MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
+    }
   }
 }
 
@@ -188,8 +214,13 @@ FORCEINLINE void CSysMatrix<ScalarType>::UpperProduct(const CSysVector<ScalarTyp
   for (auto index = dia_ptr[row_i] + 1; index < row_ptr[row_i + 1]; index++) {
     auto col_j = col_ind[index];
     /*--- Always include halos. ---*/
-    if (col_j < col_ub || col_j >= nPointDomain)
-      MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
+    if (col_j < col_ub || col_j >= nPointDomain) {
+      if (USE_QUANTIZATION && nVar > 1) {
+        QuantizedMatVecAdd(index, &vec[col_j * nEqn], prod);
+      } else {
+        MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
+      }
+    }
   }
 }
 
@@ -200,12 +231,23 @@ FORCEINLINE void CSysMatrix<ScalarType>::LowerProduct(const CSysVector<ScalarTyp
 
   for (auto index = row_ptr[row_i]; index < dia_ptr[row_i]; index++) {
     auto col_j = col_ind[index];
-    if (col_j >= col_lb) MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
+    if (col_j >= col_lb) {
+      if (USE_QUANTIZATION && nVar > 1) {
+        QuantizedMatVecAdd(index, &vec[col_j * nEqn], prod);
+      } else {
+        MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
+      }
+    }
   }
 }
 
 template <class ScalarType>
 FORCEINLINE void CSysMatrix<ScalarType>::DiagonalProduct(const CSysVector<ScalarType>& vec, unsigned long row_i,
                                                          ScalarType* prod) const {
-  MatrixVectorProduct(&matrix[dia_ptr[row_i] * nVar * nEqn], &vec[row_i * nEqn], prod);
+  if (USE_QUANTIZATION && nVar > 1) {
+    for (auto iVar = 0ul; iVar < nVar; iVar++) prod[iVar] = 0.0;
+    QuantizedMatVecAdd(dia_ptr[row_i], &vec[row_i * nEqn], prod);
+  } else {
+    MatrixVectorProduct(&matrix[dia_ptr[row_i] * nVar * nEqn], &vec[row_i * nEqn], prod);
+  }
 }
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
index 00ef51c2023..e71ea138468 100644
--- a/Common/src/linear_algebra/CSysMatrix.cpp
+++ b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -31,6 +31,7 @@
 #include "../../include/toolboxes/allocation_toolbox.hpp"
 
 #include <cmath>
+#include <limits>
 
 template <class ScalarType>
 CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::GetSize()) {
@@ -48,6 +49,9 @@ CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G
   col_ind = nullptr;
   col_ptr = nullptr;
 
+  q_offdiag = nullptr;
+  q_scale = nullptr;
+
   ILU_matrix = nullptr;
   row_ptr_ilu = nullptr;
   dia_ptr_ilu = nullptr;
@@ -71,6 +75,8 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
   MemoryAllocation::aligned_free(ILU_matrix);
   MemoryAllocation::aligned_free(matrix);
   MemoryAllocation::aligned_free(invM);
+  MemoryAllocation::aligned_free(q_offdiag);
+  MemoryAllocation::aligned_free(q_scale);
 
   if (useCuda) {
     GPUMemoryAllocation::gpu_free(d_matrix);
@@ -488,6 +494,40 @@ void CSysMatrixComms::Complete(CSysVector<T>& x, CGeometry* geometry, const CCon
 #endif
 }
 
+template <class ScalarType>
+void CSysMatrix<ScalarType>::QuantizeOffDiagonalBlocks() {
+  SU2_ZONE_SCOPED
+
+  if (nVar == 1) return;
+
+  if (q_scale == nullptr) {
+    q_offdiag = MemoryAllocation::aligned_alloc<QuantType, true>(64, nnz * nVar * nVar * sizeof(QuantType));
+    q_scale = MemoryAllocation::aligned_alloc<ScalarType, true>(64, nnz * nVar * sizeof(ScalarType));
+  }
+
+  SU2_OMP_FOR_DYN(omp_heavy_size)
+  for (auto k = 0ul; k < nnz; ++k) {
+    for (auto r = 0ul; r < nVar; ++r) {
+      const ScalarType* __restrict row = &matrix[(k * nVar + r) * nVar];
+      ScalarType& __restrict scale = q_scale[k * nVar + r];
+      QuantType* __restrict q_row = &q_offdiag[(k * nVar + r) * nVar];
+
+      /*--- Max absolute entry in each row as the scale. ---*/
+      scale = EPS;
+      for (auto c = 0ul; c < nVar; ++c) {
+        scale = fmax(scale, fabs(row[c]));
+      }
+      scale /= std::numeric_limits<QuantType>::max();
+
+      const ScalarType inv_s = 1 / scale;
+      for (auto c = 0ul; c < nVar; ++c) {
+        q_row[c] = static_cast<QuantType>(std::round(SU2_TYPE::GetValue(row[c] * inv_s)));
+      }
+    }
+  }
+  END_SU2_OMP_FOR
+}
+
 template <class ScalarType>
 void CSysMatrix<ScalarType>::SetValZero() {
   SU2_ZONE_SCOPED
@@ -628,7 +668,6 @@ void CSysMatrix<ScalarType>::MatrixInverse(ScalarType* matrix, ScalarType* inver
 
 template <class ScalarType>
 void CSysMatrix<ScalarType>::DeleteValsRowi(unsigned long block_i, unsigned long row) {
-  SU2_ZONE_SCOPED
   for (auto index = row_ptr[block_i]; index < row_ptr[block_i + 1]; index++) {
     for (auto iVar = 0u; iVar < nVar; iVar++)
       matrix[index * nVar * nVar + row * nVar + iVar] = 0.0;  // Delete row values in the block
diff --git a/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl b/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl
index 05ec25a0e56..87b158e45b3 100644
--- a/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl
+++ b/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl
@@ -649,6 +649,7 @@ void CFVMFlowSolverBase<V, R>::ImplicitEuler_Iteration(CGeometry *geometry, CSol
   }
   END_SU2_OMP_FOR
 
+  Jacobian.QuantizeOffDiagonalBlocks();
   auto iter = System.Solve(Jacobian, LinSysRes, LinSysSol, geometry, config);
 
   BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {

From 9d44c54e8bd9943ec8296c0c71560239507c2d50 Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Sat, 27 Jun 2026 11:27:21 -0700
Subject: [PATCH 2/6] [skip ci]


From 5e162270a14a7fdf22c2c1395b2c2fe4525da4e9 Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Sat, 27 Jun 2026 11:32:24 -0700
Subject: [PATCH 3/6] [skip ci]

---
 SU2_CFD/include/solvers/CScalarSolver.inl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/SU2_CFD/include/solvers/CScalarSolver.inl b/SU2_CFD/include/solvers/CScalarSolver.inl
index 7d6871f8ca4..5847879f7b5 100644
--- a/SU2_CFD/include/solvers/CScalarSolver.inl
+++ b/SU2_CFD/include/solvers/CScalarSolver.inl
@@ -572,6 +572,7 @@ void CScalarSolver<VariableType>::ImplicitEuler_Iteration(CGeometry* geometry, C
   }
   END_SU2_OMP_FOR
 
+  Jacobian.QuantizeOffDiagonalBlocks();
   auto iter = System.Solve(Jacobian, LinSysRes, LinSysSol, geometry, config);
 
   BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {

From 1e42de1ba374313b0c03846d06e32fb742e72b5f Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Sat, 27 Jun 2026 20:49:51 -0700
Subject: [PATCH 4/6] full crazy

---
 Common/include/linear_algebra/CSysMatrix.hpp |  5 +--
 Common/include/linear_algebra/CSysMatrix.inl | 34 ++++++++++++++----
 Common/src/linear_algebra/CSysMatrix.cpp     | 37 +++++++++++++-------
 3 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
index 2104f6b8908..1b6beaff213 100644
--- a/Common/include/linear_algebra/CSysMatrix.hpp
+++ b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -133,8 +133,9 @@ class CSysMatrix {
    *         the original full-precision storage. Call QuantizeOffDiagonalBlocks() first. */
   static constexpr bool USE_QUANTIZATION = true;
 
-  ScalarType* q_scale;  /*!< \brief Scale of each block row (full precision), [nnz * nVar]. */
-  QuantType* q_offdiag; /*!< \brief Quantized off-diagonal entries per block, [nnz * nVar * nVar]. */
+  ScalarType* q_bscale; /*!< \brief Per-block scale (full precision), [nnz]. */
+  uint8_t* q_scale;     /*!< \brief Per-row scale relative to q_bscale as custom float8 (5e|3s), [nnz * nVar]. */
+  QuantType* q_offdiag; /*!< \brief Quantized entries per block, [nnz * nVar * nVar]. */
 
   enum { OMP_MAX_SIZE_L = 8192 }; /*!< \brief Max. chunk size used in light parallel for loops. */
   enum { OMP_MAX_SIZE_H = 512 };  /*!< \brief Max. chunk size used in heavy parallel for loops. */
diff --git a/Common/include/linear_algebra/CSysMatrix.inl b/Common/include/linear_algebra/CSysMatrix.inl
index 56de5ae428b..f6e0b6a65dc 100644
--- a/Common/include/linear_algebra/CSysMatrix.inl
+++ b/Common/include/linear_algebra/CSysMatrix.inl
@@ -82,6 +82,24 @@ FORCEINLINE void gemm_impl(unsigned long n, const T* a, const T* b, T* c) {
     }
   }
 }
+/*--- Custom 8-bit float for per-row scales: layout [eeeee|sss], value = (1 + s/8) * 2^(-e).
+ *    Exponent is always non-negative so all values are in (0, ~1.875].
+ *    No sign, no zero — used to encode ratios in (0, 1] relative to the block peak. ---*/
+inline uint8_t float8_encode(double v) {
+  if (v <= 0.0) return static_cast<uint8_t>(31 << 3);  // clamp to minimum
+  int fe;
+  const double m = std::frexp(v, &fe);
+  int e = 1 - fe;
+  int s = static_cast<int>(std::round((2.0 * m - 1.0) * 8.0));
+  if (s >= 8) {
+    --e;
+    s = 0;
+  }  // carry: move to next higher power-of-two band (larger value → smaller e)
+  return static_cast<uint8_t>((std::max(0, std::min(31, e)) << 3) | std::max(0, std::min(7, s)));
+}
+
+inline double float8_decode(uint8_t b) { return std::ldexp(1.0 + (b & 7) * 0.125, -(b >> 3)); }
+
 }  // namespace
 
 #define __MATVECPROD_SIGNATURE__(TYPE, NAME) \
@@ -147,12 +165,12 @@ FORCEINLINE void CSysMatrix<ScalarType>::Gauss_Elimination(unsigned long block_i
   ScalarType block[MAXNVAR * MAXNVAR];
   if (USE_QUANTIZATION && nVar > 1) {
     const auto k = dia_ptr[block_i];
+    const ScalarType bscale = q_bscale[k];
+    const uint8_t* qscale = &q_scale[k * nVar];
     const QuantType* q = &q_offdiag[k * nVar * nVar];
-    const ScalarType* scale = &q_scale[k * nVar];
     for (auto r = 0ul; r < nVar; ++r) {
-      for (auto c = 0ul; c < nVar; ++c) {
-        block[r * nVar + c] = scale[r] * q[r * nVar + c];
-      }
+      const ScalarType row_scale = bscale * ScalarType(float8_decode(qscale[r]));
+      for (auto c = 0ul; c < nVar; ++c) block[r * nVar + c] = row_scale * static_cast<ScalarType>(q[r * nVar + c]);
     }
   } else {
     MatrixCopy(&matrix[dia_ptr[block_i] * nVar * nVar], block);
@@ -182,12 +200,14 @@ FORCEINLINE const ScalarType* CSysMatrix<ScalarType>::InvertDiagonalBlockILUMatr
 template <class ScalarType>
 FORCEINLINE void CSysMatrix<ScalarType>::QuantizedMatVecAdd(unsigned long k, const ScalarType* vec,
                                                             ScalarType* prod) const {
-  const ScalarType* scale = &q_scale[k * nVar];
+  const ScalarType bscale = q_bscale[k];
+  const uint8_t* qscale = &q_scale[k * nVar];
   const QuantType* q = &q_offdiag[k * nVar * nVar];
   for (auto r = 0ul; r < nVar; ++r) {
+    const ScalarType row_scale = bscale * ScalarType(float8_decode(qscale[r]));
     auto sum = ScalarType(0);
-    for (auto c = 0ul; c < nVar; ++c) sum += q[r * nVar + c] * vec[c];
-    prod[r] += scale[r] * sum;
+    for (auto c = 0ul; c < nVar; ++c) sum += static_cast<ScalarType>(q[r * nVar + c]) * vec[c];
+    prod[r] += row_scale * sum;
   }
 }
 
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
index e71ea138468..86f7a752c00 100644
--- a/Common/src/linear_algebra/CSysMatrix.cpp
+++ b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -49,6 +49,7 @@ CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G
   col_ind = nullptr;
   col_ptr = nullptr;
 
+  q_bscale = nullptr;
   q_offdiag = nullptr;
   q_scale = nullptr;
 
@@ -75,6 +76,7 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
   MemoryAllocation::aligned_free(ILU_matrix);
   MemoryAllocation::aligned_free(matrix);
   MemoryAllocation::aligned_free(invM);
+  MemoryAllocation::aligned_free(q_bscale);
   MemoryAllocation::aligned_free(q_offdiag);
   MemoryAllocation::aligned_free(q_scale);
 
@@ -500,28 +502,39 @@ void CSysMatrix<ScalarType>::QuantizeOffDiagonalBlocks() {
 
   if (nVar == 1) return;
 
-  if (q_scale == nullptr) {
+  if (q_bscale == nullptr) {
+    q_bscale = MemoryAllocation::aligned_alloc<ScalarType, true>(64, nnz * sizeof(ScalarType));
+    q_scale = MemoryAllocation::aligned_alloc<uint8_t, true>(64, nnz * nVar * sizeof(uint8_t));
     q_offdiag = MemoryAllocation::aligned_alloc<QuantType, true>(64, nnz * nVar * nVar * sizeof(QuantType));
-    q_scale = MemoryAllocation::aligned_alloc<ScalarType, true>(64, nnz * nVar * sizeof(ScalarType));
   }
+  constexpr double q_max = std::numeric_limits<QuantType>::max();
 
   SU2_OMP_FOR_DYN(omp_heavy_size)
   for (auto k = 0ul; k < nnz; ++k) {
+    const ScalarType* __restrict blk = &matrix[k * nVar * nVar];
+
+    /*--- Per-block max absolute value → block scale: q_bscale * INT8_MAX ≈ max_blk. ---*/
+    double max_blk = EPS;
+    for (auto idx = 0ul; idx < nVar * nVar; ++idx) max_blk = std::max(max_blk, std::abs(SU2_TYPE::GetValue(blk[idx])));
+    q_bscale[k] = ScalarType(max_blk / q_max);
+
     for (auto r = 0ul; r < nVar; ++r) {
-      const ScalarType* __restrict row = &matrix[(k * nVar + r) * nVar];
-      ScalarType& __restrict scale = q_scale[k * nVar + r];
+      const ScalarType* __restrict row = &blk[r * nVar];
       QuantType* __restrict q_row = &q_offdiag[(k * nVar + r) * nVar];
 
-      /*--- Max absolute entry in each row as the scale. ---*/
-      scale = EPS;
-      for (auto c = 0ul; c < nVar; ++c) {
-        scale = fmax(scale, fabs(row[c]));
-      }
-      scale /= std::numeric_limits<QuantType>::max();
+      /*--- Per-row max → encode ratio to block max as float8. ---*/
+      double max_row = 0.0;
+      for (auto c = 0ul; c < nVar; ++c) max_row = std::max(max_row, std::abs(SU2_TYPE::GetValue(row[c])));
+      q_scale[k * nVar + r] = float8_encode(max_row / max_blk);
+
+      /*--- Effective row scale = q_bscale * float8(ratio) ≈ max_row / INT8_MAX. ---*/
+      const double eff = max_blk / q_max * float8_decode(q_scale[k * nVar + r]);
+      const ScalarType inv_rscale = (eff > 0.0) ? ScalarType(1.0 / eff) : ScalarType(0);
 
-      const ScalarType inv_s = 1 / scale;
       for (auto c = 0ul; c < nVar; ++c) {
-        q_row[c] = static_cast<QuantType>(std::round(SU2_TYPE::GetValue(row[c] * inv_s)));
+        const double qval = std::round(SU2_TYPE::GetValue(row[c]) * SU2_TYPE::GetValue(inv_rscale));
+        q_row[c] = static_cast<QuantType>(std::max(double(std::numeric_limits<QuantType>::min()),
+                                                   std::min(double(std::numeric_limits<QuantType>::max()), qval)));
       }
     }
   }

From 14116a4815ab952d55c9b252efdcb412be33d345 Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Sat, 27 Jun 2026 21:15:31 -0700
Subject: [PATCH 5/6] micro optimize

---
 Common/include/linear_algebra/CSysMatrix.inl | 27 ++++++++++++++------
 Common/src/linear_algebra/CSysMatrix.cpp     | 17 ++++++------
 2 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/Common/include/linear_algebra/CSysMatrix.inl b/Common/include/linear_algebra/CSysMatrix.inl
index f6e0b6a65dc..217304b0064 100644
--- a/Common/include/linear_algebra/CSysMatrix.inl
+++ b/Common/include/linear_algebra/CSysMatrix.inl
@@ -85,20 +85,31 @@ FORCEINLINE void gemm_impl(unsigned long n, const T* a, const T* b, T* c) {
 /*--- Custom 8-bit float for per-row scales: layout [eeeee|sss], value = (1 + s/8) * 2^(-e).
  *    Exponent is always non-negative so all values are in (0, ~1.875].
  *    No sign, no zero — used to encode ratios in (0, 1] relative to the block peak. ---*/
+inline uint8_t float8_encode(float v) {
+  uint32_t bits;
+  memcpy(&bits, &v, sizeof(bits));
+  int e = 127 - static_cast<int>((bits >> 23) & 0xFFu);
+  int s = static_cast<int>((bits >> 20) & 7u) + static_cast<int>((bits >> 19) & 1u);
+  if (s >= 8) {
+    --e;
+    s = 0;
+  }
+  return static_cast<uint8_t>((std::max(0, std::min(31, e)) << 3) | s);
+}
+
 inline uint8_t float8_encode(double v) {
-  if (v <= 0.0) return static_cast<uint8_t>(31 << 3);  // clamp to minimum
-  int fe;
-  const double m = std::frexp(v, &fe);
-  int e = 1 - fe;
-  int s = static_cast<int>(std::round((2.0 * m - 1.0) * 8.0));
+  uint64_t bits;
+  memcpy(&bits, &v, sizeof(bits));
+  int e = 1023 - static_cast<int>((bits >> 52) & 0x7FFu);
+  int s = static_cast<int>((bits >> 49) & 7u) + static_cast<int>((bits >> 48) & 1u);
   if (s >= 8) {
     --e;
     s = 0;
-  }  // carry: move to next higher power-of-two band (larger value → smaller e)
-  return static_cast<uint8_t>((std::max(0, std::min(31, e)) << 3) | std::max(0, std::min(7, s)));
+  }
+  return static_cast<uint8_t>((std::max(0, std::min(31, e)) << 3) | s);
 }
 
-inline double float8_decode(uint8_t b) { return std::ldexp(1.0 + (b & 7) * 0.125, -(b >> 3)); }
+inline float float8_decode(uint8_t b) { return std::ldexp(1.0f + (b & 7) * 0.125f, -(b >> 3)); }
 
 }  // namespace
 
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
index 86f7a752c00..6a6a4279eed 100644
--- a/Common/src/linear_algebra/CSysMatrix.cpp
+++ b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -507,29 +507,30 @@ void CSysMatrix<ScalarType>::QuantizeOffDiagonalBlocks() {
     q_scale = MemoryAllocation::aligned_alloc<uint8_t, true>(64, nnz * nVar * sizeof(uint8_t));
     q_offdiag = MemoryAllocation::aligned_alloc<QuantType, true>(64, nnz * nVar * nVar * sizeof(QuantType));
   }
-  constexpr double q_max = std::numeric_limits<QuantType>::max();
+  const ScalarType q_max = std::numeric_limits<QuantType>::max();
 
   SU2_OMP_FOR_DYN(omp_heavy_size)
   for (auto k = 0ul; k < nnz; ++k) {
     const ScalarType* __restrict blk = &matrix[k * nVar * nVar];
 
     /*--- Per-block max absolute value → block scale: q_bscale * INT8_MAX ≈ max_blk. ---*/
-    double max_blk = EPS;
-    for (auto idx = 0ul; idx < nVar * nVar; ++idx) max_blk = std::max(max_blk, std::abs(SU2_TYPE::GetValue(blk[idx])));
-    q_bscale[k] = ScalarType(max_blk / q_max);
+    ScalarType max_blk = EPS;
+    for (auto idx = 0ul; idx < nVar * nVar; ++idx)
+      max_blk = std::max(max_blk, ScalarType(std::abs(SU2_TYPE::GetValue(blk[idx]))));
+    q_bscale[k] = max_blk / q_max;
 
     for (auto r = 0ul; r < nVar; ++r) {
       const ScalarType* __restrict row = &blk[r * nVar];
       QuantType* __restrict q_row = &q_offdiag[(k * nVar + r) * nVar];
 
       /*--- Per-row max → encode ratio to block max as float8. ---*/
-      double max_row = 0.0;
-      for (auto c = 0ul; c < nVar; ++c) max_row = std::max(max_row, std::abs(SU2_TYPE::GetValue(row[c])));
+      auto max_row = ScalarType(0);
+      for (auto c = 0ul; c < nVar; ++c) max_row = std::max(max_row, ScalarType(std::abs(SU2_TYPE::GetValue(row[c]))));
       q_scale[k * nVar + r] = float8_encode(max_row / max_blk);
 
       /*--- Effective row scale = q_bscale * float8(ratio) ≈ max_row / INT8_MAX. ---*/
-      const double eff = max_blk / q_max * float8_decode(q_scale[k * nVar + r]);
-      const ScalarType inv_rscale = (eff > 0.0) ? ScalarType(1.0 / eff) : ScalarType(0);
+      const ScalarType eff = max_blk / q_max * float8_decode(q_scale[k * nVar + r]);
+      const ScalarType inv_rscale = (eff > ScalarType(0)) ? ScalarType(1) / eff : ScalarType(0);
 
       for (auto c = 0ul; c < nVar; ++c) {
         const double qval = std::round(SU2_TYPE::GetValue(row[c]) * SU2_TYPE::GetValue(inv_rscale));

From c86a8a23e784e43ff96759b7918ed34a3c9c430d Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Sun, 28 Jun 2026 18:10:20 -0700
Subject: [PATCH 6/6] compress more [skip ci]

---
 .../basic_types/datatype_structure.hpp        |  4 ++
 Common/include/linear_algebra/CSysMatrix.hpp  |  3 +-
 Common/include/linear_algebra/CSysMatrix.inl  | 49 ++++---------------
 Common/src/linear_algebra/CSysMatrix.cpp      | 47 +++++++++---------
 Common/src/linear_algebra/CSysSolve.cpp       | 12 +++--
 5 files changed, 45 insertions(+), 70 deletions(-)

diff --git a/Common/include/basic_types/datatype_structure.hpp b/Common/include/basic_types/datatype_structure.hpp
index 623c410e551..d9b6bb0b58f 100644
--- a/Common/include/basic_types/datatype_structure.hpp
+++ b/Common/include/basic_types/datatype_structure.hpp
@@ -135,6 +135,10 @@ template <>
 struct Passive<su2double> {
   FORCEINLINE static passivedouble Value(const su2double& val) { return GetValue(val); }
 };
+template <class T>
+FORCEINLINE auto PassiveValue(const T& val) {
+  return Passive<T>::Value(val);
+}
 
 /*!
  * \brief Casts the primitive value to int (uses GetValue, already implemented for each type).
diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
index 1b6beaff213..1d06f457d43 100644
--- a/Common/include/linear_algebra/CSysMatrix.hpp
+++ b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -133,8 +133,7 @@ class CSysMatrix {
    *         the original full-precision storage. Call QuantizeOffDiagonalBlocks() first. */
   static constexpr bool USE_QUANTIZATION = true;
 
-  ScalarType* q_bscale; /*!< \brief Per-block scale (full precision), [nnz]. */
-  uint8_t* q_scale;     /*!< \brief Per-row scale relative to q_bscale as custom float8 (5e|3s), [nnz * nVar]. */
+  QuantType* q_scale;   /*!< \brief Per-row signed binary exponent e; scale = 2^e, [nnz * nVar]. */
   QuantType* q_offdiag; /*!< \brief Quantized entries per block, [nnz * nVar * nVar]. */
 
   enum { OMP_MAX_SIZE_L = 8192 }; /*!< \brief Max. chunk size used in light parallel for loops. */
diff --git a/Common/include/linear_algebra/CSysMatrix.inl b/Common/include/linear_algebra/CSysMatrix.inl
index 217304b0064..713d3db1a4b 100644
--- a/Common/include/linear_algebra/CSysMatrix.inl
+++ b/Common/include/linear_algebra/CSysMatrix.inl
@@ -82,35 +82,6 @@ FORCEINLINE void gemm_impl(unsigned long n, const T* a, const T* b, T* c) {
     }
   }
 }
-/*--- Custom 8-bit float for per-row scales: layout [eeeee|sss], value = (1 + s/8) * 2^(-e).
- *    Exponent is always non-negative so all values are in (0, ~1.875].
- *    No sign, no zero — used to encode ratios in (0, 1] relative to the block peak. ---*/
-inline uint8_t float8_encode(float v) {
-  uint32_t bits;
-  memcpy(&bits, &v, sizeof(bits));
-  int e = 127 - static_cast<int>((bits >> 23) & 0xFFu);
-  int s = static_cast<int>((bits >> 20) & 7u) + static_cast<int>((bits >> 19) & 1u);
-  if (s >= 8) {
-    --e;
-    s = 0;
-  }
-  return static_cast<uint8_t>((std::max(0, std::min(31, e)) << 3) | s);
-}
-
-inline uint8_t float8_encode(double v) {
-  uint64_t bits;
-  memcpy(&bits, &v, sizeof(bits));
-  int e = 1023 - static_cast<int>((bits >> 52) & 0x7FFu);
-  int s = static_cast<int>((bits >> 49) & 7u) + static_cast<int>((bits >> 48) & 1u);
-  if (s >= 8) {
-    --e;
-    s = 0;
-  }
-  return static_cast<uint8_t>((std::max(0, std::min(31, e)) << 3) | s);
-}
-
-inline float float8_decode(uint8_t b) { return std::ldexp(1.0f + (b & 7) * 0.125f, -(b >> 3)); }
-
 }  // namespace
 
 #define __MATVECPROD_SIGNATURE__(TYPE, NAME) \
@@ -176,12 +147,13 @@ FORCEINLINE void CSysMatrix<ScalarType>::Gauss_Elimination(unsigned long block_i
   ScalarType block[MAXNVAR * MAXNVAR];
   if (USE_QUANTIZATION && nVar > 1) {
     const auto k = dia_ptr[block_i];
-    const ScalarType bscale = q_bscale[k];
-    const uint8_t* qscale = &q_scale[k * nVar];
-    const QuantType* q = &q_offdiag[k * nVar * nVar];
+    const QuantType* __restrict qscale = &q_scale[k * nVar];
+    const QuantType* __restrict q = &q_offdiag[k * nVar * nVar];
     for (auto r = 0ul; r < nVar; ++r) {
-      const ScalarType row_scale = bscale * ScalarType(float8_decode(qscale[r]));
-      for (auto c = 0ul; c < nVar; ++c) block[r * nVar + c] = row_scale * static_cast<ScalarType>(q[r * nVar + c]);
+      const uint32_t rs_bits = static_cast<uint32_t>(std::max(0, static_cast<int>(qscale[r]) + 127)) << 23;
+      float row_scale;
+      memcpy(&row_scale, &rs_bits, sizeof(rs_bits));
+      for (auto c = 0ul; c < nVar; ++c) block[r * nVar + c] = row_scale * q[r * nVar + c];
     }
   } else {
     MatrixCopy(&matrix[dia_ptr[block_i] * nVar * nVar], block);
@@ -211,13 +183,12 @@ FORCEINLINE const ScalarType* CSysMatrix<ScalarType>::InvertDiagonalBlockILUMatr
 template <class ScalarType>
 FORCEINLINE void CSysMatrix<ScalarType>::QuantizedMatVecAdd(unsigned long k, const ScalarType* vec,
                                                             ScalarType* prod) const {
-  const ScalarType bscale = q_bscale[k];
-  const uint8_t* qscale = &q_scale[k * nVar];
-  const QuantType* q = &q_offdiag[k * nVar * nVar];
+  const QuantType* __restrict qscale = &q_scale[k * nVar];
+  const QuantType* __restrict q = &q_offdiag[k * nVar * nVar];
   for (auto r = 0ul; r < nVar; ++r) {
-    const ScalarType row_scale = bscale * ScalarType(float8_decode(qscale[r]));
+    const auto row_scale = ldexpf(1, static_cast<int>(qscale[r]));
     auto sum = ScalarType(0);
-    for (auto c = 0ul; c < nVar; ++c) sum += static_cast<ScalarType>(q[r * nVar + c]) * vec[c];
+    for (auto c = 0ul; c < nVar; ++c) sum += q[r * nVar + c] * vec[c];
     prod[r] += row_scale * sum;
   }
 }
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
index 6a6a4279eed..95e15f76667 100644
--- a/Common/src/linear_algebra/CSysMatrix.cpp
+++ b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -49,7 +49,6 @@ CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G
   col_ind = nullptr;
   col_ptr = nullptr;
 
-  q_bscale = nullptr;
   q_offdiag = nullptr;
   q_scale = nullptr;
 
@@ -76,7 +75,6 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
   MemoryAllocation::aligned_free(ILU_matrix);
   MemoryAllocation::aligned_free(matrix);
   MemoryAllocation::aligned_free(invM);
-  MemoryAllocation::aligned_free(q_bscale);
   MemoryAllocation::aligned_free(q_offdiag);
   MemoryAllocation::aligned_free(q_scale);
 
@@ -502,40 +500,39 @@ void CSysMatrix<ScalarType>::QuantizeOffDiagonalBlocks() {
 
   if (nVar == 1) return;
 
-  if (q_bscale == nullptr) {
-    q_bscale = MemoryAllocation::aligned_alloc<ScalarType, true>(64, nnz * sizeof(ScalarType));
-    q_scale = MemoryAllocation::aligned_alloc<uint8_t, true>(64, nnz * nVar * sizeof(uint8_t));
+  if (q_scale == nullptr) {
+    q_scale = MemoryAllocation::aligned_alloc<QuantType, true>(64, nnz * nVar * sizeof(QuantType));
     q_offdiag = MemoryAllocation::aligned_alloc<QuantType, true>(64, nnz * nVar * nVar * sizeof(QuantType));
   }
-  const ScalarType q_max = std::numeric_limits<QuantType>::max();
 
   SU2_OMP_FOR_DYN(omp_heavy_size)
   for (auto k = 0ul; k < nnz; ++k) {
-    const ScalarType* __restrict blk = &matrix[k * nVar * nVar];
-
-    /*--- Per-block max absolute value → block scale: q_bscale * INT8_MAX ≈ max_blk. ---*/
-    ScalarType max_blk = EPS;
-    for (auto idx = 0ul; idx < nVar * nVar; ++idx)
-      max_blk = std::max(max_blk, ScalarType(std::abs(SU2_TYPE::GetValue(blk[idx]))));
-    q_bscale[k] = max_blk / q_max;
-
     for (auto r = 0ul; r < nVar; ++r) {
-      const ScalarType* __restrict row = &blk[r * nVar];
+      const ScalarType* __restrict row = &matrix[(k * nVar + r) * nVar];
       QuantType* __restrict q_row = &q_offdiag[(k * nVar + r) * nVar];
 
-      /*--- Per-row max → encode ratio to block max as float8. ---*/
-      auto max_row = ScalarType(0);
-      for (auto c = 0ul; c < nVar; ++c) max_row = std::max(max_row, ScalarType(std::abs(SU2_TYPE::GetValue(row[c]))));
-      q_scale[k * nVar + r] = float8_encode(max_row / max_blk);
+      /*--- Integer max over sign-cleared float bits gives max-abs with no float compare.
+       *    The biased exponent is then free — no extra extraction step needed. ---*/
+      constexpr uint32_t eps_bits = 0x34000000u;  // FLT_EPSILON ≈ 1.19e-7, floor for degenerate rows
+      uint32_t max_abs_bits = eps_bits;
+      for (auto c = 0ul; c < nVar; ++c) {
+        uint32_t fb;
+        const float fv = SU2_TYPE::PassiveValue(row[c]);
+        memcpy(&fb, &fv, sizeof(fb));
+        max_abs_bits = std::max(max_abs_bits, fb & 0x7FFFFFFFu);
+      }
 
-      /*--- Effective row scale = q_bscale * float8(ratio) ≈ max_row / INT8_MAX. ---*/
-      const ScalarType eff = max_blk / q_max * float8_decode(q_scale[k * nVar + r]);
-      const ScalarType inv_rscale = (eff > ScalarType(0)) ? ScalarType(1) / eff : ScalarType(0);
+      /*--- e = biased_exp - 133 puts quantized max-abs in [64,128). ---*/
+      const int e_clamped = std::min(127, std::max(-128, static_cast<int>(max_abs_bits >> 23) - 133));
+      q_scale[k * nVar + r] = static_cast<QuantType>(e_clamped);
 
+      /*--- 2^(-e) is a float with biased_exp = (127-e), zero mantissa — no ldexpf needed. ---*/
+      const uint32_t inv_bits = static_cast<uint32_t>(127 - e_clamped) << 23;
+      float inv_rscale;
+      memcpy(&inv_rscale, &inv_bits, sizeof(inv_rscale));
       for (auto c = 0ul; c < nVar; ++c) {
-        const double qval = std::round(SU2_TYPE::GetValue(row[c]) * SU2_TYPE::GetValue(inv_rscale));
-        q_row[c] = static_cast<QuantType>(std::max(double(std::numeric_limits<QuantType>::min()),
-                                                   std::min(double(std::numeric_limits<QuantType>::max()), qval)));
+        q_row[c] = static_cast<QuantType>(
+            std::max(-128.f, std::min(127.f, roundf(SU2_TYPE::PassiveValue(row[c]) * inv_rscale))));
       }
     }
   }
diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp
index cd100f311ab..2a58215d2f8 100644
--- a/Common/src/linear_algebra/CSysSolve.cpp
+++ b/Common/src/linear_algebra/CSysSolve.cpp
@@ -212,11 +212,15 @@ bool CSysSolve<ScalarType>::ModGramSchmidt(bool shared_hsbg, int i, su2matrix<Sc
   LinearCombination(
       shared_hsbg, i + 1, w, [&h_i](int k) { return -h_i(0, k); }, w[i + 1], true);
 
-  const auto& dh_i = CSysVector<ScalarType>::multiDot(w, i + 1, 1, w, i + 1);
-  LinearCombination(
-      shared_hsbg, i + 1, w, [&dh_i](int k) { return -dh_i(0, k); }, w[i + 1], true);
+  if (i < 5) {
+    for (int k = 0; k < i + 1; k++) SetHsbg(k, i, h_i(0, k));
+  } else {
+    const auto& dh_i = CSysVector<ScalarType>::multiDot(w, i + 1, 1, w, i + 1);
+    LinearCombination(
+        shared_hsbg, i + 1, w, [&dh_i](int k) { return -dh_i(0, k); }, w[i + 1], true);
 
-  for (int k = 0; k < i + 1; k++) SetHsbg(k, i, h_i(0, k) + dh_i(0, k));
+    for (int k = 0; k < i + 1; k++) SetHsbg(k, i, h_i(0, k) + dh_i(0, k));
+  }
 
   /*--- The norm of w[i+1] is 0 or NaN: the input vector from mat_vec is
    * zero or contains NaN. Cannot proceed with orthogonalization. ---*/