From 18438c85079af7dc4dc0c6aef1dd1bc544ac14cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=E5=BD=A6=E9=9C=96?=
 <16507150+liu-yanlin0518@user.noreply.gitee.com>
Date: Sat, 30 May 2026 20:52:43 +0800
Subject: [PATCH 1/2] ppcg method

---
 source/source_hsolver/CMakeLists.txt          |   1 +
 source/source_hsolver/diago_ppcg.cpp          | 542 ++++++++++++++++++
 source/source_hsolver/diago_ppcg.h            | 201 +++++++
 source/source_hsolver/test/CMakeLists.txt     |  11 +
 .../test/diago_ppcg_perf_test.cpp             | 284 +++++++++
 5 files changed, 1039 insertions(+)
 create mode 100644 source/source_hsolver/diago_ppcg.cpp
 create mode 100644 source/source_hsolver/diago_ppcg.h
 create mode 100644 source/source_hsolver/test/diago_ppcg_perf_test.cpp
diff --git a/source/source_hsolver/CMakeLists.txt b/source/source_hsolver/CMakeLists.txt
index b115d6d4cd2..6b364562a04 100644
--- a/source/source_hsolver/CMakeLists.txt
+++ b/source/source_hsolver/CMakeLists.txt
@@ -13,6 +13,7 @@ list(APPEND objects
     diago_pxxxgvx.cpp
     diag_hs_para.cpp
     diago_params.cpp
+    diago_ppcg.cpp
 
 )
 
diff --git a/source/source_hsolver/diago_ppcg.cpp b/source/source_hsolver/diago_ppcg.cpp
new file mode 100644
index 00000000000..fc49be83679
--- /dev/null
+++ b/source/source_hsolver/diago_ppcg.cpp
@@ -0,0 +1,542 @@
+#include "source_hsolver/diago_ppcg.h"
+
+#include "diago_iter_assist.h"
+#include "para_linear_transform.h"
+#include "source_base/global_function.h"
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/parallel_comm.h"
+
+#include <ATen/kernels/blas.h>
+#include <ATen/kernels/lapack.h>
+#include <cstring>
+#include <limits>
+
+namespace hsolver
+{
+
+template <typename T, typename Device>
+DiagoPPCG<T, Device>::DiagoPPCG(const Real* precondition_in)
+{
+    // Cache the container type tags that describe Real, T and Device.
+    r_type = ct::DataTypeToEnum<Real>::value;
+    t_type = ct::DataTypeToEnum<T>::value;
+    device_type = ct::DeviceTypeToEnum<Device>::value;
+    // Wrap the caller's host array without copying; the shape uses n_basis as it is at construction time.
+    h_prec = ct::TensorMap((void*)precondition_in, r_type, ct::DeviceType::CpuDevice, {n_basis});
+    one = &one_;
+    zero = &zero_;
+    neg_one = &neg_one_;
+}
+
+template <typename T, typename Device>
+DiagoPPCG<T, Device>::~DiagoPPCG()
+{
+    // Nothing to release here: h_prec only refers to the caller's preconditioner array and must not be freed.
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim)
+{
+    // Record the problem sizes; every buffer below is shaped from them.
+    n_band = nband;
+    n_band_l = nband_l;
+    n_basis = nbasis;
+    n_dim = ndim;
+    // Real-valued per-band storage: eigenvalues for all bands, residual errors for the local bands.
+    eigen = ct::Tensor(r_type, device_type, {n_band});
+    err_st = ct::Tensor(r_type, device_type, {n_band_l});
+    // Block vectors psi/hpsi, w/hw, p/hp plus one scratch block, each shaped {n_band_l, n_basis}.
+    for (ct::Tensor* block : {&psi, &hpsi, &w, &hw, &p, &hp, &work})
+    {
+        *block = ct::Tensor(t_type, device_type, {n_band_l, n_basis});
+    }
+    // Device-side copy target for the preconditioner.
+    prec = ct::Tensor(r_type, device_type, {n_basis});
+    // Start with no locked bands.
+    nlocked = 0;
+    eigen_locked.resize(n_band, static_cast<Real>(0.0));
+
+#ifdef __MPI
+    pmmcn.set_dimension(BP_WORLD, POOL_WORLD, n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band);
+    plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, BP_WORLD, false);
+    // Collect the local band count of every rank in BP_WORLD and turn the counts into displacements.
+    const auto n_col_proc = plintrans.nproc_col;
+    all_n_band_l.resize(n_col_proc);
+    MPI_Allgather(&n_band_l, 1, MPI_INT, all_n_band_l.data(), 1, MPI_INT, BP_WORLD);
+    band_displs.resize(n_col_proc);
+    band_displs[0] = 0;
+    for (int rank = 1; rank < n_col_proc; ++rank)
+    {
+        band_displs[rank] = band_displs[rank - 1] + all_n_band_l[rank - 1];
+    }
+#else
+    pmmcn.set_dimension(n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band);
+    plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, false);
+    // Serial build: a single entry holding the local band count at displacement zero.
+    all_n_band_l = {n_band_l};
+    band_displs = {0};
+#endif
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::calc_prec()
+{ // Refresh the device buffer `prec` from the host-side view `h_prec` (n_basis entries).
+    syncmem_var_h2d_op()(prec.template data<Real>(), h_prec.template data<Real>(), n_basis);
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::calc_hpsi(const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out)
+{ // Forward to the caller's H operator, passing n_basis and n_band_l as its two size arguments.
+    hpsi_func(psi_in, hpsi_out.data<T>(), n_basis, n_band_l);
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::calc_grad(const ct::Tensor& prec_in,
+                                     ct::Tensor& err_out,
+                                     ct::Tensor& psi_in,
+                                     ct::Tensor& hpsi_in,
+                                     ct::Tensor& grad_out,
+                                     const int nlocked_in)
+{
+    // Per local band: residual norm into err_out, preconditioned residual into grad_out; locked bands get zeros.
+    int start_nband = 0; // presumably the global index of this band group's first band (nlocked_in is global)
+#ifdef __MPI
+    if (this->plintrans.nproc_col > 1)
+    {
+        start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP];
+    }
+#endif
+    const int local_nlocked = std::min(std::max(0, nlocked_in - start_nband), this->n_band_l);
+    Real zero_err = static_cast<Real>(0.0);
+    // Locked bands. err_out may be device memory, so it is written with the host-to-device copy op.
+    for (int ib = 0; ib < local_nlocked; ++ib)
+    {
+        setmem_complex_op()(grad_out.data<T>() + ib * this->n_basis, 0, this->n_basis);
+        syncmem_var_h2d_op()(err_out.data<Real>() + ib, &zero_err, 1);
+    }
+    // Active bands.
+    for (int ib = local_nlocked; ib < this->n_band_l; ++ib)
+    {
+        T* psi_ptr = psi_in.data<T>() + ib * this->n_basis;
+        T* hpsi_ptr = hpsi_in.data<T>() + ib * this->n_basis;
+        T* grad_ptr = grad_out.data<T>() + ib * this->n_basis;
+
+        // 1. Normalize psi (and hpsi consistently); the op divides, so the divisor is ||psi||, not 1/||psi||
+        Real norm = ModuleBase::dot_real_op<T, Device>()(this->n_dim, psi_ptr, psi_ptr, true);
+        norm = sqrt(norm);
+        ModuleBase::vector_div_constant_op<T, Device>()(this->n_dim, psi_ptr, psi_ptr, norm);
+        ModuleBase::vector_div_constant_op<T, Device>()(this->n_dim, hpsi_ptr, hpsi_ptr, norm);
+
+        // 2. Rayleigh quotient: epsilo = <psi|hpsi>
+        Real epsilo = ModuleBase::dot_real_op<T, Device>()(this->n_dim, psi_ptr, hpsi_ptr, true);
+
+        // 3. Residual: grad = hpsi - epsilo * psi
+        ModuleBase::vector_add_vector_op<T, Device>()(this->n_dim, grad_ptr, hpsi_ptr, 1.0, psi_ptr, -epsilo);
+
+        // 4. Error = ||raw residual||, stored through the copy op for the same reason as above
+        Real err = sqrt(ModuleBase::dot_real_op<T, Device>()(this->n_dim, grad_ptr, grad_ptr, true));
+        syncmem_var_h2d_op()(err_out.data<Real>() + ib, &err, 1);
+
+        // 5. Apply preconditioner: grad = grad / prec
+        ModuleBase::vector_div_vector_op<T, Device>()(this->n_dim, grad_ptr, grad_ptr, prec_in.data<Real>());
+    }
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::update_locking(const ct::Tensor& err_in, const std::vector<double>& ethr_band)
+{
+    // Advance nlocked over the leading run of bands whose residual error is within tolerance,
+    // saving the eigenvalue of each band as it becomes locked.
+    // Stage this process's residual errors in host memory first.
+    std::vector<Real> err_local(n_band_l);
+    if (err_in.device_type() != ct::DeviceType::GpuDevice)
+    {
+        std::memcpy(err_local.data(), err_in.data<Real>(), n_band_l * sizeof(Real));
+    }
+    else
+    {
+        syncmem_var_d2h_op()(err_local.data(), err_in.data<Real>(), n_band_l);
+    }
+
+    // Errors and thresholds for all n_band bands.
+    std::vector<Real> err_all(n_band, static_cast<Real>(0.0));
+    std::vector<double> ethr_all(n_band, 0.0);
+
+#ifdef __MPI
+    // Pick the MPI datatype whose width matches Real.
+    MPI_Datatype real_mpi = (sizeof(Real) == sizeof(float)) ? MPI_FLOAT : MPI_DOUBLE;
+    MPI_Allgatherv(err_local.data(),
+                   n_band_l,
+                   real_mpi,
+                   err_all.data(),
+                   all_n_band_l.data(),
+                   band_displs.data(),
+                   real_mpi,
+                   BP_WORLD);
+
+    // Gather the per-band thresholds the same way, from a local copy of ethr_band.
+    std::vector<double> ethr_local(ethr_band.begin(), ethr_band.end());
+    MPI_Allgatherv(ethr_local.data(),
+                   n_band_l,
+                   MPI_DOUBLE,
+                   ethr_all.data(),
+                   all_n_band_l.data(),
+                   band_displs.data(),
+                   MPI_DOUBLE,
+                   BP_WORLD);
+#else
+    // Serial build: the local values are copied straight into the leading slots.
+    for (int ib = 0; ib < n_band_l; ++ib)
+    {
+        err_all[ib] = err_local[ib];
+        ethr_all[ib] = ethr_band[ib];
+    }
+#endif
+
+    // Host copy of the current eigenvalue estimates.
+    std::vector<Real> eig_host(n_band, static_cast<Real>(0.0));
+    syncmem_var_d2h_op()(eig_host.data(), eigen.data<Real>(), n_band);
+
+    // Extend the locked prefix band by band; the scan stops at the first band
+    // whose error is still above its threshold.
+    while (nlocked < n_band && err_all[nlocked] <= ethr_all[nlocked])
+    {
+        eigen_locked[nlocked] = eig_host[nlocked];
+        ++nlocked;
+    }
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::orth_projection(const ct::Tensor& psi_in, ct::Tensor& hsub_tmp, ct::Tensor& grad_out)
+{
+    T* const overlap = hsub_tmp.data<T>();
+    T* const grad = grad_out.data<T>();
+    // Step 1: overlap <- psi^H * grad.  Step 2: grad <- grad - psi * overlap.
+    pmmcn.multiply(1.0, psi_in.data<T>(), grad, 0.0, overlap);
+    plintrans.act(-1.0, psi_in.data<T>(), overlap, 1.0, grad);
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::orth_cholesky(ct::Tensor& workspace_in,
+                                         ct::Tensor& psi_out,
+                                         ct::Tensor& hpsi_out,
+                                         ct::Tensor& hsub_out)
+{
+    T* const gram = hsub_out.data<T>();
+    T* const scratch = workspace_in.data<T>();
+    const int block_size = n_band_l * n_basis;
+    // gram = psi^H * psi, then Cholesky-factorise it and invert the triangular factor, all in place.
+    pmmcn.multiply(1.0, psi_out.data<T>(), psi_out.data<T>(), 0.0, gram);
+    ct::kernels::set_matrix<T, ct_Device>()('L', gram, n_band);
+    ct::kernels::lapack_potrf<T, ct_Device>()('U', n_band, gram, n_band);
+    ct::kernels::lapack_trtri<T, ct_Device>()('U', 'N', n_band, gram, n_band);
+    // Apply the resulting matrix to psi and then to hpsi, going through the scratch block each time.
+    for (ct::Tensor* block : {&psi_out, &hpsi_out})
+    {
+        plintrans.act(1.0, block->data<T>(), gram, 0.0, scratch);
+        syncmem_complex_op()(block->data<T>(), scratch, block_size);
+    }
+}
+
+template <typename T, typename Device>
+bool DiagoPPCG<T, Device>::test_error(const ct::Tensor& err_in, const std::vector<double>& ethr_band)
+{
+    // Returns true while at least one band (on any rank of BP_WORLD) is still above its threshold.
+    std::vector<Real> host_copy;
+    const Real* errors = err_in.data<Real>();
+    if (err_in.device_type() == ct::DeviceType::GpuDevice)
+    {
+        // GPU-resident errors are copied into a host buffer before being read.
+        host_copy.resize(n_band_l);
+        syncmem_var_d2h_op()(host_copy.data(), err_in.data<Real>(), n_band_l);
+        errors = host_copy.data();
+    }
+    bool not_conv = false;
+    for (int ib = 0; ib < n_band_l; ++ib)
+    {
+        // A single unconverged band is enough to report "not converged".
+        not_conv = not_conv || (errors[ib] > ethr_band[ib]);
+    }
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &not_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD);
+#endif
+    return not_conv;
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::diag_subspace(const HPsiFunc& hpsi_func,
+                                         const bool has_p,
+                                         ct::Tensor& psi_out,
+                                         ct::Tensor& hpsi_out,
+                                         ct::Tensor& p_out,
+                                         ct::Tensor& hp_out,
+                                         const int nlocked_in)
+{
+    const int n_sub = has_p ? 3 * this->n_band : 2 * this->n_band;
+    // Rayleigh-Ritz over the blocks psi, w and (if has_p) p; n_sub is the size of the projected problem.
+    // 1. Compute H|w>
+    this->calc_hpsi(hpsi_func, this->w.data<T>(), this->hw);
+
+    // 2. Compute overlap (S) and Hamiltonian (H) projection blocks.
+    //    Only upper-triangular blocks are computed explicitly;
+    //    lower-triangular parts are filled by Hermitian conjugate.
+    ct::Tensor b_00(t_type, device_type, {this->n_band, this->n_band});
+    ct::Tensor b_01(t_type, device_type, {this->n_band, this->n_band});
+    ct::Tensor b_11(t_type, device_type, {this->n_band, this->n_band});
+
+    this->pmmcn.multiply(one_, psi_out.data<T>(), psi_out.data<T>(), zero_, b_00.data<T>());
+    this->pmmcn.multiply(one_, psi_out.data<T>(), this->w.data<T>(), zero_, b_01.data<T>());
+    this->pmmcn.multiply(one_, this->w.data<T>(), this->w.data<T>(), zero_, b_11.data<T>());
+
+    ct::Tensor bh_00(t_type, device_type, {this->n_band, this->n_band});
+    ct::Tensor bh_01(t_type, device_type, {this->n_band, this->n_band});
+    ct::Tensor bh_11(t_type, device_type, {this->n_band, this->n_band});
+
+    this->pmmcn.multiply(one_, psi_out.data<T>(), hpsi_out.data<T>(), zero_, bh_00.data<T>());
+    this->pmmcn.multiply(one_, psi_out.data<T>(), this->hw.data<T>(), zero_, bh_01.data<T>());
+    this->pmmcn.multiply(one_, this->w.data<T>(), this->hw.data<T>(), zero_, bh_11.data<T>());
+
+    ct::Tensor b_02, b_12, b_22, bh_02, bh_12, bh_22;
+    if (has_p)
+    {
+        b_02 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+        b_12 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+        b_22 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+        bh_02 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+        bh_12 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+        bh_22 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+
+        this->pmmcn.multiply(one_, psi_out.data<T>(), p_out.data<T>(), zero_, b_02.data<T>());
+        this->pmmcn.multiply(one_, this->w.data<T>(), p_out.data<T>(), zero_, b_12.data<T>());
+        this->pmmcn.multiply(one_, p_out.data<T>(), p_out.data<T>(), zero_, b_22.data<T>());
+
+        this->pmmcn.multiply(one_, psi_out.data<T>(), hp_out.data<T>(), zero_, bh_02.data<T>());
+        this->pmmcn.multiply(one_, this->w.data<T>(), hp_out.data<T>(), zero_, bh_12.data<T>());
+        this->pmmcn.multiply(one_, p_out.data<T>(), hp_out.data<T>(), zero_, bh_22.data<T>());
+    }
+
+    // 3. Assemble projected matrices on CPU (column-major, leading dimension n_sub)
+    ct::Tensor hsub_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub});
+    ct::Tensor ssub_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub});
+    ct::Tensor vcc_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub});
+    ct::Tensor eigen_cpu(r_type, ct::DeviceType::CpuDevice, {n_sub});
+
+    // Helper: download one n_band x n_band block and store it (its conjugate transpose if hc) at (row_off, col_off) of H_sub (to_h) or S_sub
+    auto copy_block = [&](const ct::Tensor& dev_block, int row_off, int col_off, bool to_h, bool hc) {
+        std::vector<T> tmp(this->n_band * this->n_band);
+        syncmem_complex_d2h_op()(tmp.data(), dev_block.data<T>(), this->n_band * this->n_band);
+        T* dest = to_h ? hsub_cpu.data<T>() : ssub_cpu.data<T>();
+        for (int j = 0; j < this->n_band; ++j)
+        {
+            for (int i = 0; i < this->n_band; ++i)
+            {
+                T val = hc ? std::conj(tmp[j + i * this->n_band]) : tmp[i + j * this->n_band];
+                dest[(row_off + i) + (col_off + j) * n_sub] = val;
+            }
+        }
+    };
+
+    // S_sub assembly
+    copy_block(b_00, 0, 0, false, false);
+    copy_block(b_01, 0, this->n_band, false, false);
+    copy_block(b_01, this->n_band, 0, false, true); // b_10 = b_01^H
+    copy_block(b_11, this->n_band, this->n_band, false, false);
+
+    // H_sub assembly
+    copy_block(bh_00, 0, 0, true, false);
+    copy_block(bh_01, 0, this->n_band, true, false);
+    copy_block(bh_01, this->n_band, 0, true, true); // bh_10 = bh_01^H
+    copy_block(bh_11, this->n_band, this->n_band, true, false);
+
+    if (has_p)
+    {
+        copy_block(b_02, 0, 2 * this->n_band, false, false);
+        copy_block(b_02, 2 * this->n_band, 0, false, true);
+        copy_block(b_12, this->n_band, 2 * this->n_band, false, false);
+        copy_block(b_12, 2 * this->n_band, this->n_band, false, true);
+        copy_block(b_22, 2 * this->n_band, 2 * this->n_band, false, false);
+
+        copy_block(bh_02, 0, 2 * this->n_band, true, false);
+        copy_block(bh_02, 2 * this->n_band, 0, true, true);
+        copy_block(bh_12, this->n_band, 2 * this->n_band, true, false);
+        copy_block(bh_12, 2 * this->n_band, this->n_band, true, true);
+        copy_block(bh_22, 2 * this->n_band, 2 * this->n_band, true, false);
+    }
+
+    // 4. Freeze locked bands: row/column i of S_sub becomes the unit vector, and H_sub gets eigen_locked[i] on the diagonal
+    if (nlocked_in > 0)
+    {
+        for (int i = 0; i < nlocked_in; ++i)
+        {
+            for (int j = 0; j < n_sub; ++j)
+            {
+                T s_val = (j == i) ? one_ : zero_;
+                T h_val = (j == i) ? static_cast<T>(this->eigen_locked[i]) : zero_;
+                hsub_cpu.data<T>()[i + j * n_sub] = h_val;
+                hsub_cpu.data<T>()[j + i * n_sub] = h_val;
+                ssub_cpu.data<T>()[i + j * n_sub] = s_val;
+                ssub_cpu.data<T>()[j + i * n_sub] = s_val;
+            }
+        }
+    }
+
+    // 5. Solve generalized eigenvalue problem H_sub * v = lambda * S_sub * v
+    hsolver::hegvd_op<T, base_device::DEVICE_CPU>()(nullptr,
+                                                    n_sub,
+                                                    n_sub,
+                                                    hsub_cpu.data<T>(),
+                                                    ssub_cpu.data<T>(),
+                                                    eigen_cpu.data<Real>(),
+                                                    vcc_cpu.data<T>());
+    // NOTE(review): the overwrite below assumes locked pairs occupy slots 0..nlocked_in-1 of the solver output - confirm.
+    // Ensure locked eigenvalues remain unchanged (overwrite in case of numerical drift)
+    for (int i = 0; i < nlocked_in && i < this->n_band; ++i)
+    {
+        eigen_cpu.data<Real>()[i] = this->eigen_locked[i];
+    }
+
+    // 6. Move eigenvectors back to device
+    ct::Tensor vcc_dev = vcc_cpu.to_device<ct_Device>();
+    // NOTE(review): C_X/C_W/C_P are taken as row offsets into vcc (leading dimension n_sub); confirm act() expects that stride.
+    // 7. Update psi = X*C_X + W*C_W + (P*C_P)
+    setmem_complex_op()(this->work.data<T>(), 0, this->n_band_l * this->n_basis);
+    this->plintrans.act(1.0, psi_out.data<T>(), vcc_dev.data<T>() + 0, 0.0, this->work.data<T>());
+    this->plintrans.act(1.0, this->w.data<T>(), vcc_dev.data<T>() + this->n_band, 1.0, this->work.data<T>());
+    if (has_p)
+    {
+        this->plintrans.act(1.0, p_out.data<T>(), vcc_dev.data<T>() + 2 * this->n_band, 1.0, this->work.data<T>());
+    }
+    syncmem_complex_op()(psi_out.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // 8. Update hpsi = HX*C_X + HW*C_W + (HP*C_P)
+    setmem_complex_op()(this->work.data<T>(), 0, this->n_band_l * this->n_basis);
+    this->plintrans.act(1.0, hpsi_out.data<T>(), vcc_dev.data<T>() + 0, 0.0, this->work.data<T>());
+    this->plintrans.act(1.0, this->hw.data<T>(), vcc_dev.data<T>() + this->n_band, 1.0, this->work.data<T>());
+    if (has_p)
+    {
+        this->plintrans.act(1.0, hp_out.data<T>(), vcc_dev.data<T>() + 2 * this->n_band, 1.0, this->work.data<T>());
+    }
+    syncmem_complex_op()(hpsi_out.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // 9. Update p = W*C_W + (P*C_P)  -- LOBPCG style, no X component
+    setmem_complex_op()(this->work.data<T>(), 0, this->n_band_l * this->n_basis);
+    this->plintrans.act(1.0, this->w.data<T>(), vcc_dev.data<T>() + this->n_band, 0.0, this->work.data<T>());
+    if (has_p)
+    {
+        this->plintrans.act(1.0, p_out.data<T>(), vcc_dev.data<T>() + 2 * this->n_band, 1.0, this->work.data<T>());
+    }
+    syncmem_complex_op()(p_out.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // 10. Update hp = HW*C_W + (HP*C_P)
+    setmem_complex_op()(this->work.data<T>(), 0, this->n_band_l * this->n_basis);
+    this->plintrans.act(1.0, this->hw.data<T>(), vcc_dev.data<T>() + this->n_band, 0.0, this->work.data<T>());
+    if (has_p)
+    {
+        this->plintrans.act(1.0, hp_out.data<T>(), vcc_dev.data<T>() + 2 * this->n_band, 1.0, this->work.data<T>());
+    }
+    syncmem_complex_op()(hp_out.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // 11. Upload the first n_band entries of eigen_cpu (presumably the lowest eigenvalues) as the new estimates
+    syncmem_var_h2d_op()(this->eigen.data<Real>(), eigen_cpu.data<Real>(), this->n_band);
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
+                                T* psi_in,
+                                Real* eigenvalue_in,
+                                const std::vector<double>& ethr_band)
+{
+    const int current_scf_iter = hsolver::DiagoIterAssist<T, Device>::SCF_ITER;
+    // Flow: initial Rayleigh-Ritz on psi, then up to max_iter locked subspace iterations, then a final Rayleigh-Ritz.
+    // Map the input psi pointer (psi_in is therefore updated in place)
+    this->psi = std::move(ct::TensorMap(psi_in, t_type, device_type, {this->n_band_l, this->n_basis}));
+
+    // Update precondition array
+    this->calc_prec();
+    // NOTE(review): the standard (not generalized) eigensolve below presumes psi_in is orthonormal - confirm with callers.
+    // Initial subspace diagonalization to improve the initial guess
+    this->calc_hpsi(hpsi_func, psi_in, this->hpsi);
+
+    // Build and diagonalize the subspace Hamiltonian in the psi basis
+    ct::Tensor hsub_init(t_type, device_type, {this->n_band, this->n_band});
+    this->pmmcn.multiply(one_, this->hpsi.data<T>(), this->psi.data<T>(), zero_, hsub_init.data<T>());
+    ct::kernels::lapack_heevd<T, ct_Device>()(this->n_band,
+                                              hsub_init.data<T>(),
+                                              this->n_band,
+                                              this->eigen.data<Real>());
+
+    // Rotate psi and hpsi with the eigenvectors of the subspace problem
+    this->plintrans.act(1.0, this->psi.data<T>(), hsub_init.data<T>(), 0.0, this->work.data<T>());
+    syncmem_complex_op()(this->psi.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+    this->plintrans.act(1.0, this->hpsi.data<T>(), hsub_init.data<T>(), 0.0, this->work.data<T>());
+    syncmem_complex_op()(this->hpsi.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // Initialize search direction to zero
+    setmem_complex_op()(this->p.data<T>(), 0, this->n_band_l * this->n_basis);
+    setmem_complex_op()(this->hp.data<T>(), 0, this->n_band_l * this->n_basis);
+
+    // Allocate a reusable tensor for projection overlap
+    ct::Tensor hsub_orth(t_type, device_type, {this->n_band, this->n_band});
+    // Iteration budget: nline steps when SCF_ITER > 1, otherwise 6 * nline.
+    int ntry = 0;
+    int max_iter = current_scf_iter > 1 ? this->nline : this->nline * 6;
+    this->nlocked = 0;
+
+    do
+    {
+        ++ntry;
+
+        // 1. Calculate preconditioned residual w and error for active bands only
+        this->calc_grad(this->prec, this->err_st, this->psi, this->hpsi, this->w, this->nlocked);
+
+        // 2. Update locking status: scan from current nlocked forward
+        this->update_locking(this->err_st, ethr_band);
+
+        // 3. Exit if all bands have converged
+        if (this->nlocked >= this->n_band)
+        {
+            break;
+        }
+
+        // 4. Project active residual to orthogonal complement of psi
+        this->orth_projection(this->psi, hsub_orth, this->w);
+
+        // 5. Expanded subspace diagonalization with locking
+        //    Locked bands are frozen in the subspace problem; p/hp are only included from the second pass on
+        this->diag_subspace(hpsi_func, ntry > 1, this->psi, this->hpsi, this->p, this->hp, this->nlocked);
+
+        // Note: orth_cholesky is intentionally skipped here.
+        // The Rayleigh-Ritz step already provides orthonormal vectors (within numerical precision).
+        // Global Cholesky would destroy the locking by remixing all bands.
+
+    } while (ntry < max_iter && this->nlocked < this->n_band);
+
+    // Final subspace diagonalization to obtain accurate eigenvalues (only psi is rotated afterwards, hpsi is not)
+    this->pmmcn.multiply(one_, this->hpsi.data<T>(), this->psi.data<T>(), zero_, hsub_orth.data<T>());
+    ct::kernels::lapack_heevd<T, ct_Device>()(this->n_band,
+                                              hsub_orth.data<T>(),
+                                              this->n_band,
+                                              this->eigen.data<Real>());
+    this->plintrans.act(1.0, this->psi.data<T>(), hsub_orth.data<T>(), 0.0, this->work.data<T>());
+    syncmem_complex_op()(this->psi.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // Copy n_band_l eigenvalues, starting at offset start_nband of the eigen array, to the output
+    int start_nband = 0;
+#ifdef __MPI
+    if (this->plintrans.nproc_col > 1)
+    {
+        start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP];
+    }
+#endif
+    syncmem_var_d2h_op()(eigenvalue_in, this->eigen.data<Real>() + start_nband, this->n_band_l);
+}
+
+// Explicit template instantiations: complex float/double on CPU, plus the GPU variants when built with CUDA or ROCm
+template class DiagoPPCG<std::complex<float>, base_device::DEVICE_CPU>;
+template class DiagoPPCG<std::complex<double>, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class DiagoPPCG<std::complex<float>, base_device::DEVICE_GPU>;
+template class DiagoPPCG<std::complex<double>, base_device::DEVICE_GPU>;
+#endif
+
+} // namespace hsolver
diff --git a/source/source_hsolver/diago_ppcg.h b/source/source_hsolver/diago_ppcg.h
new file mode 100644
index 00000000000..d0ca5a2ebbb
--- /dev/null
+++ b/source/source_hsolver/diago_ppcg.h
@@ -0,0 +1,201 @@
+#ifndef DIAGO_PPCG_H_
+#define DIAGO_PPCG_H_
+
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/module_device/memory_op.h"
+#include "source_base/module_device/types.h"
+#include "source_base/para_gemm.h"
+#include "source_hsolver/kernels/hegvd_op.h"
+#include "source_hsolver/para_linear_transform.h"
+
+#include <ATen/core/tensor.h>
+#include <ATen/core/tensor_map.h>
+#include <source_base/macros.h>
+
+namespace hsolver
+{
+
+/**
+ * @class DiagoPPCG
+ * @brief A class for diagonalization using the Projected Preconditioned Conjugate Gradient (PPCG) method.
+ *
+ * The DiagoPPCG class implements a block LOBPCG-like iteration for eigenpairs of the operator applied by hpsi_func.
+ * It uses an expanded subspace [X, W, P] where X is the current eigenvector approximation,
+ * W is the preconditioned residual, and P is the conjugate search direction from previous steps.
+ *
+ * @tparam T The scalar type of the wavefunctions; instantiated for std::complex<float> and std::complex<double>.
+ * @tparam Device The device used for calculations (e.g., cpu or gpu).
+ */
+template <typename T = std::complex<double>, typename Device = base_device::DEVICE_CPU>
+class DiagoPPCG
+{
+  private:
+    using Real = typename GetTypeReal<T>::type;
+
+  public:
+    /**
+     * @brief Constructor for DiagoPPCG class.
+     *
+     * @param precondition_in Pointer to the host precondition array (referenced, not copied or owned).
+     */
+    explicit DiagoPPCG(const Real* precondition_in);
+
+    /**
+     * @brief Destructor for DiagoPPCG class.
+     */
+    ~DiagoPPCG();
+
+    /**
+     * @brief Initialize the class before diagonalization.
+     *
+     * @param nband The number of bands of all processes.
+     * @param nband_l The number of bands of current process.
+     * @param nbasis The number of basis functions. Leading dimension of psi.
+     * @param ndim The number of valid dimension of psi.
+     */
+    void init_iter(const int nband, const int nband_l, const int nbasis, const int ndim);
+
+    using HPsiFunc = std::function<void(T*, T*, const int, const int)>;
+
+    /**
+     * @brief Diagonalize the Hamiltonian using the PPCG method.
+     *
+     * @param hpsi_func A function computing the product of the Hamiltonian matrix H
+     * and a wavefunction blockvector X.
+     * @param psi_in Pointer to the local wavefunction block [dim: n_basis x n_band_l, column major]; updated in place.
+     * @param eigenvalue_in Pointer to the host output array receiving n_band_l eigenvalues.
+     * @param ethr_band Convergence threshold for each band.
+     */
+    void diag(const HPsiFunc& hpsi_func, T* psi_in, Real* eigenvalue_in, const std::vector<double>& ethr_band);
+
+  private:
+    /// the number of bands of all processes
+    int n_band = 0;
+    /// the number of bands of current process
+    int n_band_l = 0;
+    /// the number of basis functions (leading dimension of psi)
+    int n_basis = 0;
+    /// valid dimension of psi
+    int n_dim = 0;
+    /// base iteration budget of the ppcg loop (diag() uses nline or 6 * nline)
+    int nline = 4;
+
+    /// parallel matrix multiplication
+    ModuleBase::PGemmCN<T, Device> pmmcn;
+    PLinearTransform<T, Device> plintrans;
+
+    ct::DataType r_type = ct::DataType::DT_INVALID;
+    ct::DataType t_type = ct::DataType::DT_INVALID;
+    ct::DeviceType device_type = ct::DeviceType::UnKnown;
+
+    ct::Tensor h_prec = {};
+    ct::Tensor prec = {};
+    ct::Tensor eigen = {};
+
+    /// Number of globally converged (locked) bands
+    int nlocked = 0;
+    /// Locked eigenvalues on CPU
+    std::vector<Real> eigen_locked;
+    /// MPI band distribution for error gathering
+    std::vector<int> all_n_band_l;
+    std::vector<int> band_displs;
+    ct::Tensor err_st = {};
+
+    ct::Tensor psi = {}, hpsi = {};
+    ct::Tensor w = {}, hw = {};
+    ct::Tensor p = {}, hp = {};
+    ct::Tensor work = {};
+
+    Device* ctx = {};
+    const T *one = nullptr, *zero = nullptr, *neg_one = nullptr; // set by the constructor to &one_, &zero_, &neg_one_
+    const T one_ = static_cast<T>(1.0), zero_ = static_cast<T>(0.0), neg_one_ = static_cast<T>(-1.0);
+
+    /**
+     * @brief Update the precondition array from host to device.
+     */
+    void calc_prec();
+
+    /**
+     * @brief Apply the H operator to psi and obtain the hpsi matrix.
+     */
+    void calc_hpsi(const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out);
+
+    /**
+     * @brief Calculate the preconditioned residual (gradient) and error.
+     *
+     * @param prec_in Input preconditioner.
+     * @param err_out Output error for each local band.
+     * @param psi_in,hpsi_in Wavefunction and H|psi> blocks; both are rescaled in place for the active bands.
+     * @param grad_out Output preconditioned residual.
+     * @param nlocked_in Number of locked bands (global count); their residual and error are set to zero.
+     */
+    void calc_grad(const ct::Tensor& prec_in,
+                   ct::Tensor& err_out,
+                   ct::Tensor& psi_in,
+                   ct::Tensor& hpsi_in,
+                   ct::Tensor& grad_out,
+                   const int nlocked_in = 0);
+
+    /**
+     * @brief Orthogonalize grad to psi using S-inner product (S=I for norm-conserving).
+     *
+     * @param psi_in Input wavefunction.
+     * @param hsub_tmp Workspace for overlap matrix.
+     * @param grad_out Input/Output gradient.
+     */
+    void orth_projection(const ct::Tensor& psi_in, ct::Tensor& hsub_tmp, ct::Tensor& grad_out);
+
+    /**
+     * @brief Perform expanded subspace diagonalization and update X, P, HX, HP.
+     *
+     * @param hpsi_func Hamiltonian application function.
+     * @param has_p If true, use 3-block [X, W, P]; otherwise use 2-block [X, W].
+     * @param psi_out Input/Output wavefunction.
+     * @param hpsi_out Input/Output H|psi>.
+     * @param p_out,hp_out Input/Output search direction and H|p>.
+     * @param nlocked_in Number of locked bands whose rows/columns are frozen in the projected problem.
+     */
+    void diag_subspace(const HPsiFunc& hpsi_func,
+                       const bool has_p,
+                       ct::Tensor& psi_out,
+                       ct::Tensor& hpsi_out,
+                       ct::Tensor& p_out,
+                       ct::Tensor& hp_out,
+                       const int nlocked_in = 0);
+
+    /**
+     * @brief Orthogonalize and normalize psi using Cholesky decomposition.
+     */
+    void orth_cholesky(ct::Tensor& workspace_in, ct::Tensor& psi_out, ct::Tensor& hpsi_out, ct::Tensor& hsub_out);
+
+    /**
+     * @brief Update locking status: scan errors from current nlocked forward
+     *        and lock bands that have converged.
+     */
+    void update_locking(const ct::Tensor& err_in, const std::vector<double>& ethr_band);
+
+    /**
+     * @brief Return true if any band is still above its threshold (i.e. NOT all bands have converged).
+     */
+    bool test_error(const ct::Tensor& err_in, const std::vector<double>& ethr_band);
+
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
+    using setmem_var_op = ct::kernels::set_memory<Real, ct_Device>;
+    using resmem_var_op = ct::kernels::resize_memory<Real, ct_Device>;
+    using delmem_var_op = ct::kernels::delete_memory<Real, ct_Device>;
+    using syncmem_var_h2d_op = ct::kernels::synchronize_memory<Real, ct_Device, ct::DEVICE_CPU>;
+    using syncmem_var_d2h_op = ct::kernels::synchronize_memory<Real, ct::DEVICE_CPU, ct_Device>;
+
+    using setmem_complex_op = ct::kernels::set_memory<T, ct_Device>;
+    using delmem_complex_op = ct::kernels::delete_memory<T, ct_Device>;
+    using resmem_complex_op = ct::kernels::resize_memory<T, ct_Device>;
+    using syncmem_complex_op = ct::kernels::synchronize_memory<T, ct_Device, ct_Device>;
+    using syncmem_complex_h2d_op = ct::kernels::synchronize_memory<T, ct_Device, ct::DEVICE_CPU>;
+    using syncmem_complex_d2h_op = ct::kernels::synchronize_memory<T, ct::DEVICE_CPU, ct_Device>;
+
+    using gemm_op = ModuleBase::gemm_op<T, Device>;
+};
+
+} // namespace hsolver
+
+#endif // DIAGO_PPCG_H_
diff --git a/source/source_hsolver/test/CMakeLists.txt b/source/source_hsolver/test/CMakeLists.txt
index 1b1529adb4a..13a0b0032dd 100644
--- a/source/source_hsolver/test/CMakeLists.txt
+++ b/source/source_hsolver/test/CMakeLists.txt
@@ -16,6 +16,17 @@ if (ENABLE_MPI)
             ../../source_hamilt/operator.cpp
             ../../source_pw/module_pwdft/op_pw.cpp
   )
+  AddTest(
+    TARGET MODULE_HSOLVER_ppcg_perf
+    LIBS parameter  ${math_libs} base psi device container
+    SOURCES diago_ppcg_perf_test.cpp
+            ../diago_ppcg.cpp ../diago_bpcg.cpp ../diago_cg.cpp ../diago_david.cpp
+            ../para_linear_transform.cpp ../diago_iter_assist.cpp ../diag_const_nums.cpp
+            ../kernels/hegvd_op.cpp
+            ../../source_basis/module_pw/test/test_tool.cpp
+            ../../source_hamilt/operator.cpp
+            ../../source_pw/module_pwdft/op_pw.cpp
+  )
   AddTest(
     TARGET MODULE_HSOLVER_cg
     LIBS parameter  ${math_libs} base psi device container
diff --git a/source/source_hsolver/test/diago_ppcg_perf_test.cpp b/source/source_hsolver/test/diago_ppcg_perf_test.cpp
new file mode 100644
index 00000000000..6983d2eaccc
--- /dev/null
+++ b/source/source_hsolver/test/diago_ppcg_perf_test.cpp
@@ -0,0 +1,284 @@
+#include "../diag_comm_info.h"
+#include "../diago_bpcg.h"
+#include "../diago_cg.h"
+#include "../diago_david.h"
+#include "../diago_iter_assist.h"
+#include "../diago_ppcg.h"
+#include "diago_mock.h"
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/module_external/lapack_connector.h"
+#include "source_psi/psi.h"
+
+#include <cmath>
+#include <complex>
+#include <iomanip>
+#include <iostream>
+#include <mpi.h>
+#include <random>
+#include <vector>
+
+#ifdef __MPI
+#include "source_base/parallel_comm.h"
+#endif
+
+using T = std::complex<double>;
+
+// LAPACK reference eigenvalues (values only) of the npw x npw matrix hm; aborts the run if zheev reports an error
+void lapack_eigenvalues(int npw, const std::vector<T>& hm, double* e)
+{
+    std::vector<T> tmp = hm; // zheev overwrites its input matrix, so it gets a private copy
+    int lwork = std::max(1, 2 * npw);
+    std::vector<T> work(lwork);
+    std::vector<double> rwork(std::max(1, 3 * npw - 2)); // clamp: 3 * npw - 2 would wrap to a huge size for npw < 1
+    int info = 0;
+    char jobz = 'N', uplo = 'U';
+    zheev_(&jobz, &uplo, &npw, tmp.data(), &npw, e, work.data(), &lwork, rwork.data(), &info);
+    if (info != 0) { std::cerr << "zheev failed, info = " << info << std::endl; MPI_Abort(MPI_COMM_WORLD, 1); }
+}
+
+// Unified H|psi> via one dense gemm: hpsi_out = hmat * psi_in, with hmat used as a dim x dim matrix (leading
+// dimension dim) and nvec vectors of leading dimension ld_psi. The lambda captures its own copy of hmat.
+auto make_hpsi_func(const std::vector<T>& hmat, int dim)
+{
+    return [hmat, dim](T* psi_in, T* hpsi_out, const int ld_psi, const int nvec) {
+        T one = T(1.0);
+        T zero = T(0.0);
+        ModuleBase::gemm_op<T, base_device::DEVICE_CPU>()('N',
+                                                          'N',
+                                                          dim,
+                                                          nvec,
+                                                          dim,
+                                                          &one,
+                                                          hmat.data(),
+                                                          dim,
+                                                          psi_in,
+                                                          ld_psi,
+                                                          &zero,
+                                                          hpsi_out,
+                                                          ld_psi);
+    };
+}
+
+// Overlap operator for the norm-conserving case: S is the identity, so S|psi> is a plain forward copy of |psi>
+auto spsi_identity = [](T* psi_in, T* spsi_out, const int ld_psi, const int nvec) {
+    const T* const src_end = psi_in + ld_psi * nvec;
+    for (T* dst = spsi_out; psi_in < src_end; ++psi_in, ++dst)
+    {
+        *dst = *psi_in;
+    }
+};
+
+struct PerfResult // one row of the printed comparison table
+{
+    std::string name;       // solver label shown in the "Method" column
+    double time = 0.0;      // duration of the diag() call, measured as a difference of MPI_Wtime values
+    double max_err = 0.0;   // largest |computed - reference| eigenvalue difference over the tested bands
+    bool converged = false; // true when max_err is below 1e-2 ("OK" column)
+};
+
+// -------------------- shared helpers --------------------
+// Every test_* runner below copies psi0, times a single diag() call with MPI_Wtime and reports the maximum
+// eigenvalue error against e_ref; a run is flagged as converged when that error is below eigen_err_tol.
+constexpr double eigen_err_tol = 1e-2;
+
+// Largest |en[i] - e_ref[i]| over the first nband eigenvalues (0.0 when nband <= 0).
+static double max_eigen_error(const std::vector<double>& en, const std::vector<double>& e_ref, const int nband)
+{
+    double err = 0.0;
+    for (int i = 0; i < nband; ++i)
+    {
+        err = std::max(err, std::abs(en[i] - e_ref[i]));
+    }
+    return err;
+}
+
+// -------------------- PPCG --------------------
+PerfResult test_ppcg(int nband,
+                     int npw,
+                     double ethr,
+                     const psi::Psi<T>& psi0,
+                     const std::function<void(T*, T*, const int, const int)>& hpsi_func,
+                     double* precondition,
+                     const std::vector<double>& e_ref)
+{
+    psi::Psi<T> psi(psi0);
+    psi.fix_k(0);
+    std::vector<double> en(nband, 0.0);
+
+    hsolver::DiagoPPCG<T> ppcg(precondition);
+    ppcg.init_iter(nband, nband, npw, npw);
+    hsolver::DiagoIterAssist<T>::SCF_ITER = 1; // first SCF step
+    std::vector<double> ethr_band(nband, ethr);
+
+    double t1 = MPI_Wtime();
+    ppcg.diag(hpsi_func, psi.get_pointer(), en.data(), ethr_band);
+    double t2 = MPI_Wtime();
+
+    const double err = max_eigen_error(en, e_ref, nband);
+    return {"PPCG", t2 - t1, err, err < eigen_err_tol};
+}
+
+// -------------------- BPCG --------------------
+PerfResult test_bpcg(int nband,
+                     int npw,
+                     double ethr,
+                     const psi::Psi<T>& psi0,
+                     const std::function<void(T*, T*, const int, const int)>& hpsi_func,
+                     double* precondition,
+                     const std::vector<double>& e_ref)
+{
+    psi::Psi<T> psi(psi0);
+    psi.fix_k(0);
+    std::vector<double> en(nband, 0.0);
+
+    hsolver::DiagoBPCG<T> bpcg(precondition);
+    bpcg.init_iter(nband, nband, npw, npw);
+    hsolver::DiagoIterAssist<T>::SCF_ITER = 1;
+    std::vector<double> ethr_band(nband, ethr);
+
+    double t1 = MPI_Wtime();
+    bpcg.diag(hpsi_func, psi.get_pointer(), en.data(), ethr_band);
+    double t2 = MPI_Wtime();
+
+    const double err = max_eigen_error(en, e_ref, nband);
+    return {"BPCG", t2 - t1, err, err < eigen_err_tol};
+}
+
+// -------------------- CG --------------------
+PerfResult test_cg(int nband,
+                   int npw,
+                   double ethr,
+                   int maxiter,
+                   const psi::Psi<T>& psi0,
+                   const std::function<void(T*, T*, const int, const int)>& hpsi_func,
+                   double* precondition,
+                   const std::vector<double>& e_ref)
+{
+    psi::Psi<T> psi(psi0);
+    psi.fix_k(0);
+    std::vector<double> en(nband, 0.0);
+
+    hsolver::DiagoCG<T> cg("pw", "scf");
+    hsolver::DiagoIterAssist<T>::PW_DIAG_NMAX = maxiter;
+    hsolver::DiagoIterAssist<T>::PW_DIAG_THR = ethr;
+    std::vector<double> ethr_band(nband, ethr);
+
+    double t1 = MPI_Wtime();
+    cg.diag(hpsi_func, spsi_identity, npw, nband, npw, psi.get_pointer(), en.data(), ethr_band, precondition);
+    double t2 = MPI_Wtime();
+
+    const double err = max_eigen_error(en, e_ref, nband);
+    return {"CG", t2 - t1, err, err < eigen_err_tol};
+}
+
+// -------------------- Davidson --------------------
+PerfResult test_david(int nband,
+                      int npw,
+                      double ethr,
+                      int maxiter,
+                      const psi::Psi<T>& psi0,
+                      const std::function<void(T*, T*, const int, const int)>& hpsi_func,
+                      double* precondition,
+                      const std::vector<double>& e_ref)
+{
+    psi::Psi<T> psi(psi0);
+    psi.fix_k(0);
+    std::vector<double> en(nband, 0.0);
+
+#ifdef __MPI
+    int rank = 0, nproc = 1;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+    const hsolver::diag_comm_info comm_info(MPI_COMM_WORLD, rank, nproc);
+#else
+    const hsolver::diag_comm_info comm_info(0, 1);
+#endif
+
+    hsolver::DiagoDavid<T> david(precondition, nband, npw, 4, false, comm_info);
+    std::vector<double> ethr_band(nband, ethr);
+
+    double t1 = MPI_Wtime();
+    david.diag(hpsi_func, spsi_identity, npw, psi.get_pointer(), en.data(), ethr_band, maxiter);
+    double t2 = MPI_Wtime();
+
+    const double err = max_eigen_error(en, e_ref, nband);
+    return {"Davidson", t2 - t1, err, err < eigen_err_tol};
+}
+
+// Benchmark driver: builds one test problem via HPsi, runs PPCG / BPCG / CG / Davidson on it, prints a table.
+int main(int argc, char** argv)
+{
+    MPI_Init(&argc, &argv);
+    int rank = 0, nproc = 1;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+#ifdef __MPI
+    BP_WORLD = MPI_COMM_WORLD;
+#endif
+
+    // ---------- test parameters ----------
+    int nband = 20;
+    int npw = 500;
+    int sparsity = 0; // 0 = dense
+    double ethr = 1e-5; // per-band threshold handed to every solver
+    int maxiter = 300;  // iteration cap, passed to the CG and Davidson runners only
+    // -------------------------------------
+
+    // generate Hamiltonian, precondition and initial guess
+    HPsi<T> hpsi_gen(nband, npw, sparsity);
+    DIAGOTEST::hmatrix = hpsi_gen.hamilt();
+    DIAGOTEST::npw = npw;
+    DIAGOTEST::npw_local = new int[1]; // released with delete[] right before MPI_Finalize
+    DIAGOTEST::npw_local[0] = npw;
+    DIAGOTEST::hmatrix_local = DIAGOTEST::hmatrix;
+
+    double* precondition = hpsi_gen.precond();
+
+    // LAPACK reference: eigenvalues of the full npw x npw matrix (the solvers are compared on the first nband)
+    std::vector<double> e_ref(npw);
+    lapack_eigenvalues(npw, DIAGOTEST::hmatrix, e_ref.data());
+
+    // initial psi guess: Hamiltonian entries hmatrix[j * npw + i] scaled by a random factor (not eigenvectors)
+    psi::Psi<T> psi0(1, nband, npw, npw, true);
+    std::default_random_engine p(1); // fixed seed
+    std::uniform_int_distribution<unsigned> u(1, 10);
+    for (int i = 0; i < nband; ++i)
+    {
+        for (int j = 0; j < npw; ++j)
+        {
+            double r = static_cast<double>(u(p)) / 10.0; // r is one of 0.1, 0.2, ..., 1.0
+            psi0(0, i, j) = DIAGOTEST::hmatrix[j * npw + i] * r;
+        }
+    }
+
+    auto hpsi_func = make_hpsi_func(DIAGOTEST::hmatrix_local, npw);
+
+    // run benchmarks; every runner works on its own copy of psi0
+    PerfResult r_ppcg = test_ppcg(nband, npw, ethr, psi0, hpsi_func, precondition, e_ref);
+    PerfResult r_bpcg = test_bpcg(nband, npw, ethr, psi0, hpsi_func, precondition, e_ref);
+    PerfResult r_cg = test_cg(nband, npw, ethr, maxiter, psi0, hpsi_func, precondition, e_ref);
+    PerfResult r_david = test_david(nband, npw, ethr, maxiter, psi0, hpsi_func, precondition, e_ref);
+
+    if (rank == 0) // only rank 0 prints the table
+    {
+        std::cout << "\n========================================\n";
+        std::cout << "  Diagonalization Performance Comparison\n";
+        std::cout << "  nband=" << nband << ", npw=" << npw << ", sparsity=" << sparsity << "\n";
+        std::cout << "========================================\n";
+        std::cout << std::setw(10) << "Method" << std::setw(14) << "Time(s)" << std::setw(14) << "MaxError"
+                  << std::setw(8) << "OK" << "\n";
+        std::cout << "----------------------------------------\n";
+        auto print = [](const PerfResult& r) {
+            std::cout << std::setw(10) << r.name << std::setw(14) << std::scientific << std::setprecision(3) << r.time
+                      << std::setw(14) << r.max_err << std::setw(8) << (r.converged ? "Yes" : "No") << "\n";
+        };
+        print(r_ppcg);
+        print(r_bpcg);
+        print(r_cg);
+        print(r_david);
+        std::cout << "========================================\n\n";
+    }
+
+    delete[] DIAGOTEST::npw_local;
+    MPI_Finalize();
+    return 0; // NOTE(review): exit code is 0 even when a row reports "No" — the table is informational only
+}

From 3c96bc6fc938b41aba6c35b4ce17967cb637e52e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=E5=BD=A6=E9=9C=96?=
 <16507150+liu-yanlin0518@user.noreply.gitee.com>
Date: Fri, 19 Jun 2026 19:28:28 +0800
Subject: [PATCH 2/2] add openMP in bpcg

---
 source/source_hsolver/diago_bpcg.cpp          | 213 +++++++++---------
 .../source_hsolver/kernels/bpcg_kernel_op.cpp |  96 +++++---
 2 files changed, 166 insertions(+), 143 deletions(-)

diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp
index d4db3d790bc..e1ccaf7bbf3 100644
--- a/source/source_hsolver/diago_bpcg.cpp
+++ b/source/source_hsolver/diago_bpcg.cpp
@@ -1,62 +1,65 @@
 #include "source_hsolver/diago_bpcg.h"
 
 #include "diago_iter_assist.h"
+#include "para_linear_transform.h"
 #include "source_base/global_function.h"
 #include "source_base/kernels/math_kernel_op.h"
 #include "source_base/parallel_comm.h" // different MPI worlds
 #include "source_hsolver/kernels/bpcg_kernel_op.h"
-#include "para_linear_transform.h"
 
 #include <ATen/kernels/blas.h>
 #include <ATen/kernels/lapack.h>
 #include <ATen/ops/einsum_op.h>
 #include <limits>
 
-namespace hsolver {
+namespace hsolver
+{
 
-template<typename T, typename Device>
+template <typename T, typename Device>
 DiagoBPCG<T, Device>::DiagoBPCG(const Real* precondition_in)
 {
-    this->r_type   = ct::DataTypeToEnum<Real>::value;
-    this->t_type   = ct::DataTypeToEnum<T>::value;
-    this->device_type    = ct::DeviceTypeToEnum<Device>::value;
+    this->r_type = ct::DataTypeToEnum<Real>::value;
+    this->t_type = ct::DataTypeToEnum<T>::value;
+    this->device_type = ct::DeviceTypeToEnum<Device>::value;
 
-    this->h_prec  = std::move(ct::TensorMap((void *) precondition_in, r_type, device_type, {this->n_basis}));
+    this->h_prec = std::move(ct::TensorMap((void*)precondition_in, r_type, device_type, {this->n_basis}));
 
     this->one = &one_;
     this->zero = &zero_;
     this->neg_one = &neg_one_;
 }
 
-template<typename T, typename Device>
-DiagoBPCG<T, Device>::~DiagoBPCG() {
+template <typename T, typename Device>
+DiagoBPCG<T, Device>::~DiagoBPCG()
+{
     // Note, we do not need to free the h_prec and psi pointer as they are refs to the outside data
 }
 
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim) {
+template <typename T, typename Device>
+void DiagoBPCG<T, Device>::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim)
+{
     // Specify the problem size n_basis, n_band, while lda is n_basis
-    this->n_band        = nband;
-    this->n_band_l      = nband_l;
-    this->n_basis       = nbasis;
-    this->n_dim         = ndim;
+    this->n_band = nband;
+    this->n_band_l = nband_l;
+    this->n_basis = nbasis;
+    this->n_dim = ndim;
 
     // All column major tensors
 
-    this->beta          = std::move(ct::Tensor(r_type, device_type, {this->n_band_l}));
-    this->eigen         = std::move(ct::Tensor(r_type, device_type, {this->n_band}));
-    this->err_st        = std::move(ct::Tensor(r_type, device_type, {this->n_band_l}));
+    this->beta = std::move(ct::Tensor(r_type, device_type, {this->n_band_l}));
+    this->eigen = std::move(ct::Tensor(r_type, device_type, {this->n_band}));
+    this->err_st = std::move(ct::Tensor(r_type, device_type, {this->n_band_l}));
 
-    this->hsub          = std::move(ct::Tensor(t_type, device_type, {this->n_band, this->n_band}));
+    this->hsub = std::move(ct::Tensor(t_type, device_type, {this->n_band, this->n_band}));
 
-    this->hpsi          = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
-    this->work          = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
-    this->hgrad         = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
-    this->grad_old      = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->hpsi = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->work = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->hgrad = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->grad_old = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
 
-    this->prec          = std::move(ct::Tensor(r_type, device_type, {this->n_basis}));
+    this->prec = std::move(ct::Tensor(r_type, device_type, {this->n_basis}));
 
-    this->grad          = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->grad = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
 #ifdef __MPI
     this->pmmcn.set_dimension(BP_WORLD, POOL_WORLD, n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band);
     this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, BP_WORLD, false);
@@ -66,13 +69,14 @@ void DiagoBPCG<T, Device>::init_iter(const int nband, const int nband_l, const i
 #endif
 }
 
-template<typename T, typename Device>
+template <typename T, typename Device>
 bool DiagoBPCG<T, Device>::test_error(const ct::Tensor& err_in, const std::vector<double>& ethr_band)
 {
     Real* _err_st = err_in.data<Real>();
     bool not_conv = false;
     std::vector<Real> tmp_cpu;
-    if (err_in.device_type() == ct::DeviceType::GpuDevice) {
+    if (err_in.device_type() == ct::DeviceType::GpuDevice)
+    {
         // ct::Tensor h_err_in = err_in.to_device<ct::DEVICE_CPU>();
         // _err_st = h_err_in.data<Real>();
         // qianrui change it, because it can not pass the valgrind test
@@ -80,11 +84,18 @@ bool DiagoBPCG<T, Device>::test_error(const ct::Tensor& err_in, const std::vecto
         _err_st = tmp_cpu.data();
         syncmem_var_d2h_op()(_err_st, err_in.data<Real>(), this->n_band_l);
     }
-    for (int ii = 0; ii < this->n_band_l; ii++) {
-        if (_err_st[ii] > ethr_band[ii]) {
-            not_conv = true;
+    int not_conv_int = 0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(max : not_conv_int)
+#endif
+    for (int ii = 0; ii < this->n_band_l; ii++)
+    {
+        if (_err_st[ii] > ethr_band[ii])
+        {
+            not_conv_int = 1;
         }
     }
+    not_conv = (not_conv_int != 0);
 #ifdef __MPI
     MPI_Allreduce(MPI_IN_PLACE, &not_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD);
 #endif
@@ -92,12 +103,11 @@ bool DiagoBPCG<T, Device>::test_error(const ct::Tensor& err_in, const std::vecto
 }
 
 // Finally, the last one!
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::line_minimize(
-    ct::Tensor& grad_in,
-    ct::Tensor& hgrad_in,
-    ct::Tensor& psi_out,
-    ct::Tensor& hpsi_out)
+template <typename T, typename Device>
+void DiagoBPCG<T, Device>::line_minimize(ct::Tensor& grad_in,
+                                         ct::Tensor& hgrad_in,
+                                         ct::Tensor& psi_out,
+                                         ct::Tensor& hpsi_out)
 {
     line_minimize_with_block_op<T, Device>()(grad_in.data<T>(),
                                              hgrad_in.data<T>(),
@@ -108,40 +118,34 @@ void DiagoBPCG<T, Device>::line_minimize(
                                              this->n_band_l);
 }
 
-
 // Finally, the last two!
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::orth_cholesky(
-		ct::Tensor& workspace_in,
-		ct::Tensor& psi_out,
-		ct::Tensor& hpsi_out,
-		ct::Tensor& hsub_out)
+template <typename T, typename Device>
+void DiagoBPCG<T, Device>::orth_cholesky(ct::Tensor& workspace_in,
+                                         ct::Tensor& psi_out,
+                                         ct::Tensor& hpsi_out,
+                                         ct::Tensor& hsub_out)
 {
     // gemm: hsub_out(n_band x n_band) = psi_out^T(n_band x n_basis) * psi_out(n_basis x n_band)
     this->pmmcn.multiply(1.0, psi_out.data<T>(), psi_out.data<T>(), 0.0, hsub_out.data<T>());
 
     // set hsub matrix to lower format;
-    ct::kernels::set_matrix<T, ct_Device>()(
-        'L', hsub_out.data<T>(), this->n_band);
+    ct::kernels::set_matrix<T, ct_Device>()('L', hsub_out.data<T>(), this->n_band);
 
-    ct::kernels::lapack_potrf<T, ct_Device>()(
-        'U', this->n_band, hsub_out.data<T>(), this->n_band);
-    ct::kernels::lapack_trtri<T, ct_Device>()(
-        'U', 'N', this->n_band, hsub_out.data<T>(), this->n_band);
+    ct::kernels::lapack_potrf<T, ct_Device>()('U', this->n_band, hsub_out.data<T>(), this->n_band);
+    ct::kernels::lapack_trtri<T, ct_Device>()('U', 'N', this->n_band, hsub_out.data<T>(), this->n_band);
 
     this->rotate_wf(hsub_out, psi_out, workspace_in);
     this->rotate_wf(hsub_out, hpsi_out, workspace_in);
 }
 
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::calc_grad_with_block(
-        const ct::Tensor& prec_in,
-        ct::Tensor& err_out,
-        ct::Tensor& beta_out,
-        ct::Tensor& psi_in,
-        ct::Tensor& hpsi_in,
-        ct::Tensor& grad_out,
-        ct::Tensor& grad_old_out)
+template <typename T, typename Device>
+void DiagoBPCG<T, Device>::calc_grad_with_block(const ct::Tensor& prec_in,
+                                                ct::Tensor& err_out,
+                                                ct::Tensor& beta_out,
+                                                ct::Tensor& psi_in,
+                                                ct::Tensor& hpsi_in,
+                                                ct::Tensor& grad_out,
+                                                ct::Tensor& grad_old_out)
 {
     calc_grad_with_block_op<T, Device>()(prec_in.data<Real>(),
                                          err_out.data<Real>(),
@@ -155,17 +159,14 @@ void DiagoBPCG<T, Device>::calc_grad_with_block(
                                          this->n_band_l);
 }
 
-template<typename T, typename Device>
+template <typename T, typename Device>
 void DiagoBPCG<T, Device>::calc_prec()
 {
     syncmem_var_h2d_op()(this->prec.template data<Real>(), this->h_prec.template data<Real>(), this->n_basis);
 }
 
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::orth_projection(
-        const ct::Tensor& psi_in,
-        ct::Tensor& hsub_in,
-        ct::Tensor& grad_out)
+template <typename T, typename Device>
+void DiagoBPCG<T, Device>::orth_projection(const ct::Tensor& psi_in, ct::Tensor& hsub_in, ct::Tensor& grad_out)
 {
     // gemm: hsub_in(n_band x n_band) = psi_in^T(n_band x n_basis) * grad_out(n_basis x n_band)
     this->pmmcn.multiply(1.0, psi_in.data<T>(), grad_out.data<T>(), 0.0, hsub_in.data<T>());
@@ -176,11 +177,8 @@ void DiagoBPCG<T, Device>::orth_projection(
     return;
 }
 
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::rotate_wf(
-        const ct::Tensor& hsub_in,
-        ct::Tensor& psi_out,
-        ct::Tensor& workspace_in)
+template <typename T, typename Device>
+void DiagoBPCG<T, Device>::rotate_wf(const ct::Tensor& hsub_in, ct::Tensor& psi_out, ct::Tensor& workspace_in)
 {
     // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band)
     this->plintrans.act(1.0, psi_out.data<T>(), hsub_in.data<T>(), 0.0, workspace_in.data<T>());
@@ -189,47 +187,46 @@ void DiagoBPCG<T, Device>::rotate_wf(
     return;
 }
 
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::calc_hpsi_with_block(
-        const HPsiFunc& hpsi_func,
-        T *psi_in,
-        ct::Tensor& hpsi_out)
+template <typename T, typename Device>
+void DiagoBPCG<T, Device>::calc_hpsi_with_block(const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out)
 {
     // calculate all-band hpsi
     hpsi_func(psi_in, hpsi_out.data<T>(), this->n_basis, this->n_band_l);
 }
 
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::diag_hsub(
-        const ct::Tensor& psi_in,
-        const ct::Tensor& hpsi_in,
-        ct::Tensor& hsub_out,
-        ct::Tensor& eigenvalue_out)
+template <typename T, typename Device>
+void DiagoBPCG<T, Device>::diag_hsub(const ct::Tensor& psi_in,
+                                     const ct::Tensor& hpsi_in,
+                                     ct::Tensor& hsub_out,
+                                     ct::Tensor& eigenvalue_out)
 {
     // gemm: hsub_out(n_band x n_band) = hpsi_in^T(n_band x n_basis) * psi_in(n_basis x n_band)
     this->pmmcn.multiply(1.0, hpsi_in.data<T>(), psi_in.data<T>(), 0.0, hsub_out.data<T>());
 
-    // ct::kernels::lapack_heevd<T, ct_Device>()('V', 'U', hsub_out.data<T>(), this->n_band, eigenvalue_out.data<Real>());
-    ct::kernels::lapack_heevd<T, ct_Device>()(this->n_band, hsub_out.data<T>(), this->n_band, eigenvalue_out.data<Real>());
+    // ct::kernels::lapack_heevd<T, ct_Device>()('V', 'U', hsub_out.data<T>(), this->n_band,
+    // eigenvalue_out.data<Real>());
+    ct::kernels::lapack_heevd<T, ct_Device>()(this->n_band,
+                                              hsub_out.data<T>(),
+                                              this->n_band,
+                                              eigenvalue_out.data<Real>());
 
     return;
 }
 
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::calc_hsub_with_block(
-        const HPsiFunc& hpsi_func,
-        T *psi_in,
-        ct::Tensor& psi_out,
-        ct::Tensor& hpsi_out,
-        ct::Tensor& hsub_out,
-        ct::Tensor& workspace_in,
-        ct::Tensor& eigenvalue_out)
+template <typename T, typename Device>
+void DiagoBPCG<T, Device>::calc_hsub_with_block(const HPsiFunc& hpsi_func,
+                                                T* psi_in,
+                                                ct::Tensor& psi_out,
+                                                ct::Tensor& hpsi_out,
+                                                ct::Tensor& hsub_out,
+                                                ct::Tensor& workspace_in,
+                                                ct::Tensor& eigenvalue_out)
 {
     // Apply the H operator to psi and obtain the hpsi matrix.
     this->calc_hpsi_with_block(hpsi_func, psi_in, hpsi_out);
 
     // Diagonalization of the subspace matrix.
-    this->diag_hsub(psi_out,hpsi_out, hsub_out, eigenvalue_out);
+    this->diag_hsub(psi_out, hpsi_out, hsub_out, eigenvalue_out);
 
     // inplace matmul to get the initial guessed wavefunction psi.
     // psi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band]
@@ -240,13 +237,12 @@ void DiagoBPCG<T, Device>::calc_hsub_with_block(
     return;
 }
 
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::calc_hsub_with_block_exit(
-        ct::Tensor& psi_out,
-        ct::Tensor& hpsi_out,
-        ct::Tensor& hsub_out,
-        ct::Tensor& workspace_in,
-        ct::Tensor& eigenvalue_out)
+template <typename T, typename Device>
+void DiagoBPCG<T, Device>::calc_hsub_with_block_exit(ct::Tensor& psi_out,
+                                                     ct::Tensor& hpsi_out,
+                                                     ct::Tensor& hsub_out,
+                                                     ct::Tensor& workspace_in,
+                                                     ct::Tensor& eigenvalue_out)
 {
     // Diagonalization of the subspace matrix.
     this->diag_hsub(psi_out, hpsi_out, hsub_out, eigenvalue_out);
@@ -266,7 +262,8 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
 {
     const int current_scf_iter = hsolver::DiagoIterAssist<T, Device>::SCF_ITER;
     // Get the pointer of the input psi
-    this->psi = std::move(ct::TensorMap(psi_in /*psi_in.get_pointer()*/, t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->psi = std::move(
+        ct::TensorMap(psi_in /*psi_in.get_pointer()*/, t_type, device_type, {this->n_band_l, this->n_basis}));
 
     // Update the precondition array
     this->calc_prec();
@@ -279,9 +276,7 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
     setmem_var_op()(this->beta.template data<Real>(), std::numeric_limits<Real>::infinity(), this->n_band_l);
 
     int ntry = 0;
-    int max_iter = current_scf_iter > 1 ?
-                   this->nline :
-                   this->nline * 6;
+    int max_iter = current_scf_iter > 1 ? this->nline : this->nline * 6;
     do
     {
         ++ntry;
@@ -291,8 +286,13 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
         // 3. calculate the gradient by hpsi - epsilo * psi
         // 4. gradient mix with the previous gradient
         // 5. Do precondition
-        this->calc_grad_with_block(this->prec, this->err_st, this->beta,
-                                 this->psi, this->hpsi, this->grad, this->grad_old);
+        this->calc_grad_with_block(this->prec,
+                                   this->err_st,
+                                   this->beta,
+                                   this->psi,
+                                   this->hpsi,
+                                   this->grad,
+                                   this->grad_old);
 
         // Orthogonalize column vectors g_i in matrix grad to column vectors p_j in matrix psi
         // for all 'j less or equal to i'.
@@ -314,7 +314,8 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
         // orthogonal psi by cholesky method
         this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub);
 
-        if (current_scf_iter == 1 && ntry % this->nline == 0) {
+        if (current_scf_iter == 1 && ntry % this->nline == 0)
+        {
             this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen);
         }
     } while (ntry < max_iter && this->test_error(this->err_st, ethr_band));
diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
index 88f94e288c6..cc2a5f7736f 100644
--- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp
+++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
@@ -1,7 +1,9 @@
 #include "source_hsolver/kernels/bpcg_kernel_op.h"
-#include "source_base/module_external/blas_connector.h"
+
 #include "source_base/kernels/math_kernel_op.h"
+#include "source_base/module_external/blas_connector.h"
 #include "source_base/parallel_reduce.h"
+
 #include <vector>
 namespace hsolver
 {
@@ -26,6 +28,9 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
             Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
             Parallel_Reduce::reduce_pool(norm);
             norm = 1.0 / sqrt(norm);
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : epsilo_0, epsilo_1, epsilo_2)
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -41,6 +46,9 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
             theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
             cos_theta = std::cos(theta);
             sin_theta = std::sin(theta);
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -77,6 +85,9 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
             Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
             Parallel_Reduce::reduce_pool(norm);
             norm = 1.0 / sqrt(norm);
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : epsilo)
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -85,6 +96,9 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
                 epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
             }
             Parallel_Reduce::reduce_pool(epsilo);
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : err, beta)
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -95,6 +109,9 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
             }
             Parallel_Reduce::reduce_pool(err);
             Parallel_Reduce::reduce_pool(beta);
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -111,8 +128,16 @@ template <typename T>
 struct apply_eigenvalues_op<T, base_device::DEVICE_CPU>
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues)
+    void operator()(const int& nbase,
+                    const int& nbase_x,
+                    const int& notconv,
+                    T* result,
+                    const T* vectors,
+                    const Real* eigenvalues)
     {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2)
+#endif
         for (int m = 0; m < notconv; m++)
         {
             for (int idx = 0; idx < nbase; idx++)
@@ -124,59 +149,62 @@ struct apply_eigenvalues_op<T, base_device::DEVICE_CPU>
 };
 
 template <typename T>
-struct precondition_op<T, base_device::DEVICE_CPU> {
+struct precondition_op<T, base_device::DEVICE_CPU> // CPU specialization: rescales `notconv` vectors stored in psi_iter
+{
     using Real = typename GetTypeReal<T>::type;
     void operator()(const int& dim,
-                   T* psi_iter,
-                   const int& nbase,
-                   const int& notconv,
-                   const Real* precondition,
-                   const Real* eigenvalues)
+                    T* psi_iter, // vector m is addressed as psi_iter + (nbase + m) * dim
+                    const int& nbase, // index offset of the first vector that is processed
+                    const int& notconv, // number of consecutive vectors that are processed
+                    const Real* precondition, // read at indices [0, dim)
+                    const Real* eigenvalues) // read at indices [0, notconv)
     {
-        std::vector<Real> pre(dim, 0.0);
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif // _OPENMP: each iteration m hands its own slice of psi_iter to the division op
         for (int m = 0; m < notconv; m++)
         {
+            std::vector<Real> pre(dim, 0.0); // declared per iteration so the scratch buffer is never shared between OpenMP threads
             for (size_t i = 0; i < dim; i++)
             {
                 Real x = std::abs(precondition[i] - eigenvalues[m]);
                 pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
             }
-            ModuleBase::vector_div_vector_op<T, base_device::DEVICE_CPU>()(
-                                                             dim,
-                                                             psi_iter + (nbase + m) * dim,
-                                                             psi_iter + (nbase + m) * dim,
-                                                             pre.data());
+            ModuleBase::vector_div_vector_op<T, base_device::DEVICE_CPU>()(dim,
+                                                                           psi_iter + (nbase + m) * dim,
+                                                                           psi_iter + (nbase + m) * dim,
+                                                                           pre.data()); // same slice passed for both pointer args; presumably an in-place element-wise divide by pre -- confirm against vector_div_vector_op
         }
     }
 };
 
 template <typename T>
-struct normalize_op<T, base_device::DEVICE_CPU> {
+struct normalize_op<T, base_device::DEVICE_CPU>
+{
     void operator()(const int& dim,
-                   T* psi_iter,
-                   const int& nbase,
-                   const int& notconv,
-                   typename GetTypeReal<T>::type* psi_norm)
+                    T* psi_iter,
+                    const int& nbase,
+                    const int& notconv,
+                    typename GetTypeReal<T>::type* psi_norm)
     {
         using Real = typename GetTypeReal<T>::type;
         for (int m = 0; m < notconv; m++)
         {
             // Calculate norm using dot_real_op
-            Real psi_m_norm = ModuleBase::dot_real_op<T, base_device::DEVICE_CPU>()(
-                                                                dim,
-                                                                psi_iter + (nbase + m) * dim,
-                                                                psi_iter + (nbase + m) * dim,
-                                                                true);
+            Real psi_m_norm = ModuleBase::dot_real_op<T, base_device::DEVICE_CPU>()(dim,
+                                                                                    psi_iter + (nbase + m) * dim,
+                                                                                    psi_iter + (nbase + m) * dim,
+                                                                                    true);
             assert(psi_m_norm > 0.0);
             psi_m_norm = sqrt(psi_m_norm);
 
             // Normalize using vector_div_constant_op
-            ModuleBase::vector_div_constant_op<T, base_device::DEVICE_CPU>()(
-                                                              dim,
-                                                              psi_iter + (nbase + m) * dim,
-                                                              psi_iter + (nbase + m) * dim,
-                                                              psi_m_norm);
-            if (psi_norm) {
+            ModuleBase::vector_div_constant_op<T, base_device::DEVICE_CPU>()(dim,
+                                                                             psi_iter + (nbase + m) * dim,
+                                                                             psi_iter + (nbase + m) * dim,
+                                                                             psi_m_norm);
+            if (psi_norm)
+            {
                 psi_norm[m] = psi_m_norm;
             }
         }
@@ -187,13 +215,7 @@ template <typename T>
 struct refresh_hcc_scc_vcc_op<T, base_device::DEVICE_CPU>
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const int &n,
-                  T *hcc,
-                  T *scc,
-                  T *vcc,
-                  const int &ldh,
-                  const Real *eigenvalue,
-                  const T &one)
+    void operator()(const int& n, T* hcc, T* scc, T* vcc, const int& ldh, const Real* eigenvalue, const T& one)
     {
 #ifdef _OPENMP
 #pragma omp parallel for collapse(1) schedule(static)