Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Common/include/basic_types/datatype_structure.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@ template <>
struct Passive<su2double> {
FORCEINLINE static passivedouble Value(const su2double& val) { return GetValue(val); }
};
template <class T>
FORCEINLINE auto PassiveValue(const T& val) {
return Passive<T>::Value(val);
}

/*!
* \brief Casts the primitive value to int (uses GetValue, already implemented for each type).
Expand Down
25 changes: 25 additions & 0 deletions Common/include/linear_algebra/CSysMatrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "CPastixWrapper.hpp"
#include "../toolboxes/graph_toolbox.hpp"

#include <cstdint>
#include <cstdlib>
#include <vector>
#include <cassert>
Expand Down Expand Up @@ -125,6 +126,16 @@ class CSysMatrix {
* size is needed for fast, per-thread, static memory allocation. */
enum : size_t { MAXNVAR = 20 };

/*--- Quantized off-diagonal storage. ---*/
using QuantType = int8_t;
/*! \brief When true, off-diagonal blocks are read from the quantized representation
* during matrix-vector products and LU-SGS; the diagonal block always uses
* the original full-precision storage. Call QuantizeOffDiagonalBlocks() first. */
static constexpr bool USE_QUANTIZATION = true;

QuantType* q_scale; /*!< \brief Per-row signed binary exponent e; scale = 2^e, [nnz * nVar]. */
QuantType* q_offdiag; /*!< \brief Quantized entries per block, [nnz * nVar * nVar]. */

enum { OMP_MAX_SIZE_L = 8192 }; /*!< \brief Max. chunk size used in light parallel for loops. */
enum { OMP_MAX_SIZE_H = 512 }; /*!< \brief Max. chunk size used in heavy parallel for loops. */
enum { OMP_MIN_SIZE = 32 }; /*!< \brief Chunk size for finer grain operations. */
Expand Down Expand Up @@ -372,6 +383,15 @@ class CSysMatrix {
*/
void RowProduct(const CSysVector<ScalarType>& vec, unsigned long row_i, ScalarType* prod) const;

/*!
* \brief Computes product += A_k * vec using the quantized representation of block k.
* \note Only valid after QuantizeOffDiagonalBlocks() has been called.
* \param[in] k - Block index in the CSR flat storage.
* \param[in] vec - Input vector (nEqn entries).
* \param[in,out] prod - Accumulation output (nVar entries).
*/
inline void QuantizedMatVecAdd(unsigned long k, const ScalarType* vec, ScalarType* prod) const;

public:
/*!
* \brief Constructor of the class.
Expand All @@ -398,6 +418,11 @@ class CSysMatrix {
bool EdgeConnect, CGeometry* geometry, const CConfig* config, bool needTranspPtr = false,
bool grad_mode = false);

/*!
* \brief Compresses off-diagonal blocks into quantized form for use with USE_QUANTIZATION.
*/
void QuantizeOffDiagonalBlocks();

/*!
* \brief Sets to zero all the entries of the sparse matrix.
*/
Expand Down
58 changes: 51 additions & 7 deletions Common/include/linear_algebra/CSysMatrix.inl
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,19 @@ template <class ScalarType>
FORCEINLINE void CSysMatrix<ScalarType>::Gauss_Elimination(unsigned long block_i, ScalarType* rhs) const {
/*--- Copy block, as the algorithm modifies the matrix ---*/
ScalarType block[MAXNVAR * MAXNVAR];
MatrixCopy(&matrix[dia_ptr[block_i] * nVar * nVar], block);

if (USE_QUANTIZATION && nVar > 1) {
const auto k = dia_ptr[block_i];
const QuantType* __restrict qscale = &q_scale[k * nVar];
const QuantType* __restrict q = &q_offdiag[k * nVar * nVar];
for (auto r = 0ul; r < nVar; ++r) {
const uint32_t rs_bits = static_cast<uint32_t>(std::max(0, static_cast<int>(qscale[r]) + 127)) << 23;
float row_scale;
memcpy(&row_scale, &rs_bits, sizeof(rs_bits));
for (auto c = 0ul; c < nVar; ++c) block[r * nVar + c] = row_scale * q[r * nVar + c];
}
} else {
MatrixCopy(&matrix[dia_ptr[block_i] * nVar * nVar], block);
}
Gauss_Elimination(block, rhs);
}

Expand All @@ -169,14 +180,31 @@ FORCEINLINE const ScalarType* CSysMatrix<ScalarType>::InvertDiagonalBlockILUMatr
return Uii;
}

template <class ScalarType>
FORCEINLINE void CSysMatrix<ScalarType>::QuantizedMatVecAdd(unsigned long k, const ScalarType* vec,
ScalarType* prod) const {
const QuantType* __restrict qscale = &q_scale[k * nVar];
const QuantType* __restrict q = &q_offdiag[k * nVar * nVar];
for (auto r = 0ul; r < nVar; ++r) {
const auto row_scale = ldexpf(1, static_cast<int>(qscale[r]));
auto sum = ScalarType(0);
for (auto c = 0ul; c < nVar; ++c) sum += q[r * nVar + c] * vec[c];
prod[r] += row_scale * sum;
}
}

template <class ScalarType>
FORCEINLINE void CSysMatrix<ScalarType>::RowProduct(const CSysVector<ScalarType>& vec, unsigned long row_i,
ScalarType* prod) const {
for (auto iVar = 0ul; iVar < nVar; iVar++) prod[iVar] = 0.0;

for (auto index = row_ptr[row_i]; index < row_ptr[row_i + 1]; index++) {
auto col_j = col_ind[index];
MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
if (USE_QUANTIZATION && nVar > 1) {
QuantizedMatVecAdd(index, &vec[col_j * nEqn], prod);
} else {
MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
}
}
}

Expand All @@ -188,8 +216,13 @@ FORCEINLINE void CSysMatrix<ScalarType>::UpperProduct(const CSysVector<ScalarTyp
for (auto index = dia_ptr[row_i] + 1; index < row_ptr[row_i + 1]; index++) {
auto col_j = col_ind[index];
/*--- Always include halos. ---*/
if (col_j < col_ub || col_j >= nPointDomain)
MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
if (col_j < col_ub || col_j >= nPointDomain) {
if (USE_QUANTIZATION && nVar > 1) {
QuantizedMatVecAdd(index, &vec[col_j * nEqn], prod);
} else {
MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
}
}
}
}

Expand All @@ -200,12 +233,23 @@ FORCEINLINE void CSysMatrix<ScalarType>::LowerProduct(const CSysVector<ScalarTyp

for (auto index = row_ptr[row_i]; index < dia_ptr[row_i]; index++) {
auto col_j = col_ind[index];
if (col_j >= col_lb) MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
if (col_j >= col_lb) {
if (USE_QUANTIZATION && nVar > 1) {
QuantizedMatVecAdd(index, &vec[col_j * nEqn], prod);
} else {
MatrixVectorProductAdd(&matrix[index * nVar * nEqn], &vec[col_j * nEqn], prod);
}
}
}
}

template <class ScalarType>
FORCEINLINE void CSysMatrix<ScalarType>::DiagonalProduct(const CSysVector<ScalarType>& vec, unsigned long row_i,
ScalarType* prod) const {
MatrixVectorProduct(&matrix[dia_ptr[row_i] * nVar * nEqn], &vec[row_i * nEqn], prod);
if (USE_QUANTIZATION && nVar > 1) {
for (auto iVar = 0ul; iVar < nVar; iVar++) prod[iVar] = 0.0;
QuantizedMatVecAdd(dia_ptr[row_i], &vec[row_i * nEqn], prod);
} else {
MatrixVectorProduct(&matrix[dia_ptr[row_i] * nVar * nEqn], &vec[row_i * nEqn], prod);
}
}
52 changes: 51 additions & 1 deletion Common/src/linear_algebra/CSysMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "../../include/toolboxes/allocation_toolbox.hpp"

#include <cmath>
#include <limits>

template <class ScalarType>
CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::GetSize()) {
Expand All @@ -48,6 +49,9 @@ CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G
col_ind = nullptr;
col_ptr = nullptr;

q_offdiag = nullptr;
q_scale = nullptr;

ILU_matrix = nullptr;
row_ptr_ilu = nullptr;
dia_ptr_ilu = nullptr;
Expand All @@ -71,6 +75,8 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
MemoryAllocation::aligned_free(ILU_matrix);
MemoryAllocation::aligned_free(matrix);
MemoryAllocation::aligned_free(invM);
MemoryAllocation::aligned_free(q_offdiag);
MemoryAllocation::aligned_free(q_scale);

if (useCuda) {
GPUMemoryAllocation::gpu_free(d_matrix);
Expand Down Expand Up @@ -488,6 +494,51 @@ void CSysMatrixComms::Complete(CSysVector<T>& x, CGeometry* geometry, const CCon
#endif
}

template <class ScalarType>
void CSysMatrix<ScalarType>::QuantizeOffDiagonalBlocks() {
SU2_ZONE_SCOPED

if (nVar == 1) return;

if (q_scale == nullptr) {
q_scale = MemoryAllocation::aligned_alloc<QuantType, true>(64, nnz * nVar * sizeof(QuantType));
q_offdiag = MemoryAllocation::aligned_alloc<QuantType, true>(64, nnz * nVar * nVar * sizeof(QuantType));
}

SU2_OMP_FOR_DYN(omp_heavy_size)
for (auto k = 0ul; k < nnz; ++k) {
for (auto r = 0ul; r < nVar; ++r) {
const ScalarType* __restrict row = &matrix[(k * nVar + r) * nVar];
QuantType* __restrict q_row = &q_offdiag[(k * nVar + r) * nVar];

/*--- Integer max over sign-cleared float bits gives max-abs with no float compare.
* The biased exponent is then free — no extra extraction step needed. ---*/
constexpr uint32_t eps_bits = 0x34000000u; // FLT_EPSILON ≈ 1.19e-7, floor for degenerate rows
uint32_t max_abs_bits = eps_bits;
for (auto c = 0ul; c < nVar; ++c) {
uint32_t fb;
const float fv = SU2_TYPE::PassiveValue(row[c]);
memcpy(&fb, &fv, sizeof(fb));
max_abs_bits = std::max(max_abs_bits, fb & 0x7FFFFFFFu);
}

/*--- e = biased_exp - 133 puts quantized max-abs in [64,128). ---*/
const int e_clamped = std::min(127, std::max(-128, static_cast<int>(max_abs_bits >> 23) - 133));
q_scale[k * nVar + r] = static_cast<QuantType>(e_clamped);

/*--- 2^(-e) is a float with biased_exp = (127-e), zero mantissa — no ldexpf needed. ---*/
const uint32_t inv_bits = static_cast<uint32_t>(127 - e_clamped) << 23;
float inv_rscale;
memcpy(&inv_rscale, &inv_bits, sizeof(inv_rscale));
for (auto c = 0ul; c < nVar; ++c) {
q_row[c] = static_cast<QuantType>(
std::max(-128.f, std::min(127.f, roundf(SU2_TYPE::PassiveValue(row[c]) * inv_rscale))));
}
}
}
END_SU2_OMP_FOR
}

template <class ScalarType>
void CSysMatrix<ScalarType>::SetValZero() {
SU2_ZONE_SCOPED
Expand Down Expand Up @@ -628,7 +679,6 @@ void CSysMatrix<ScalarType>::MatrixInverse(ScalarType* matrix, ScalarType* inver

template <class ScalarType>
void CSysMatrix<ScalarType>::DeleteValsRowi(unsigned long block_i, unsigned long row) {
SU2_ZONE_SCOPED
for (auto index = row_ptr[block_i]; index < row_ptr[block_i + 1]; index++) {
for (auto iVar = 0u; iVar < nVar; iVar++)
matrix[index * nVar * nVar + row * nVar + iVar] = 0.0; // Delete row values in the block
Expand Down
12 changes: 8 additions & 4 deletions Common/src/linear_algebra/CSysSolve.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,11 +212,15 @@ bool CSysSolve<ScalarType>::ModGramSchmidt(bool shared_hsbg, int i, su2matrix<Sc
LinearCombination(
shared_hsbg, i + 1, w, [&h_i](int k) { return -h_i(0, k); }, w[i + 1], true);

const auto& dh_i = CSysVector<ScalarType>::multiDot(w, i + 1, 1, w, i + 1);
LinearCombination(
shared_hsbg, i + 1, w, [&dh_i](int k) { return -dh_i(0, k); }, w[i + 1], true);
if (i < 5) {
for (int k = 0; k < i + 1; k++) SetHsbg(k, i, h_i(0, k));
} else {
const auto& dh_i = CSysVector<ScalarType>::multiDot(w, i + 1, 1, w, i + 1);
LinearCombination(
shared_hsbg, i + 1, w, [&dh_i](int k) { return -dh_i(0, k); }, w[i + 1], true);

for (int k = 0; k < i + 1; k++) SetHsbg(k, i, h_i(0, k) + dh_i(0, k));
for (int k = 0; k < i + 1; k++) SetHsbg(k, i, h_i(0, k) + dh_i(0, k));
}

/*--- The norm of w[i+1] is 0 or NaN: the input vector from mat_vec is
* zero or contains NaN. Cannot proceed with orthogonalization. ---*/
Expand Down
1 change: 1 addition & 0 deletions SU2_CFD/include/solvers/CFVMFlowSolverBase.inl
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,7 @@ void CFVMFlowSolverBase<V, R>::ImplicitEuler_Iteration(CGeometry *geometry, CSol
}
END_SU2_OMP_FOR

Jacobian.QuantizeOffDiagonalBlocks();
auto iter = System.Solve(Jacobian, LinSysRes, LinSysSol, geometry, config);

BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {
Expand Down
1 change: 1 addition & 0 deletions SU2_CFD/include/solvers/CScalarSolver.inl
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,7 @@ void CScalarSolver<VariableType>::ImplicitEuler_Iteration(CGeometry* geometry, C
}
END_SU2_OMP_FOR

Jacobian.QuantizeOffDiagonalBlocks();
auto iter = System.Solve(Jacobian, LinSysRes, LinSysSol, geometry, config);

BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {
Expand Down