From 18438c85079af7dc4dc0c6aef1dd1bc544ac14cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=BD=A6=E9=9C=96?= <16507150+liu-yanlin0518@user.noreply.gitee.com> Date: Sat, 30 May 2026 20:52:43 +0800 Subject: [PATCH 1/2] ppcg method --- source/source_hsolver/CMakeLists.txt | 1 + source/source_hsolver/diago_ppcg.cpp | 542 ++++++++++++++++++ source/source_hsolver/diago_ppcg.h | 201 +++++++ source/source_hsolver/test/CMakeLists.txt | 11 + .../test/diago_ppcg_perf_test.cpp | 284 +++++++++ 5 files changed, 1039 insertions(+) create mode 100644 source/source_hsolver/diago_ppcg.cpp create mode 100644 source/source_hsolver/diago_ppcg.h create mode 100644 source/source_hsolver/test/diago_ppcg_perf_test.cpp diff --git a/source/source_hsolver/CMakeLists.txt b/source/source_hsolver/CMakeLists.txt index b115d6d4cd2..6b364562a04 100644 --- a/source/source_hsolver/CMakeLists.txt +++ b/source/source_hsolver/CMakeLists.txt @@ -13,6 +13,7 @@ list(APPEND objects diago_pxxxgvx.cpp diag_hs_para.cpp diago_params.cpp + diago_ppcg.cpp ) diff --git a/source/source_hsolver/diago_ppcg.cpp b/source/source_hsolver/diago_ppcg.cpp new file mode 100644 index 00000000000..fc49be83679 --- /dev/null +++ b/source/source_hsolver/diago_ppcg.cpp @@ -0,0 +1,542 @@ +#include "source_hsolver/diago_ppcg.h" + +#include "diago_iter_assist.h" +#include "para_linear_transform.h" +#include "source_base/global_function.h" +#include "source_base/kernels/math_kernel_op.h" +#include "source_base/parallel_comm.h" + +#include +#include +#include +#include + +namespace hsolver +{ + +template +DiagoPPCG::DiagoPPCG(const Real* precondition_in) +{ + this->r_type = ct::DataTypeToEnum::value; + this->t_type = ct::DataTypeToEnum::value; + this->device_type = ct::DeviceTypeToEnum::value; + + this->h_prec = std::move(ct::TensorMap((void*)precondition_in, r_type, ct::DeviceType::CpuDevice, {this->n_basis})); + + this->one = &one_; + this->zero = &zero_; + this->neg_one = &neg_one_; +} + +template +DiagoPPCG::~DiagoPPCG() +{ + // h_prec is a ref to outside data, do not free. +} + +template +void DiagoPPCG::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim) +{ + this->n_band = nband; + this->n_band_l = nband_l; + this->n_basis = nbasis; + this->n_dim = ndim; + + this->eigen = std::move(ct::Tensor(r_type, device_type, {this->n_band})); + this->err_st = std::move(ct::Tensor(r_type, device_type, {this->n_band_l})); + + this->psi = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->hpsi = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->w = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->hw = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->p = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->hp = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->work = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + + this->prec = std::move(ct::Tensor(r_type, device_type, {this->n_basis})); + + this->nlocked = 0; + this->eigen_locked.resize(this->n_band, static_cast(0.0)); + +#ifdef __MPI + this->pmmcn.set_dimension(BP_WORLD, POOL_WORLD, n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band); + this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, BP_WORLD, false); + + this->all_n_band_l.resize(this->plintrans.nproc_col); + MPI_Allgather(&this->n_band_l, 1, MPI_INT, this->all_n_band_l.data(), 1, MPI_INT, BP_WORLD); + this->band_displs.resize(this->plintrans.nproc_col); + this->band_displs[0] = 0; + for (int i = 1; i < this->plintrans.nproc_col; ++i) + { + this->band_displs[i] = this->band_displs[i - 1] + this->all_n_band_l[i - 1]; + } +#else + this->pmmcn.set_dimension(n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band); + this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, false); + this->all_n_band_l = {this->n_band_l}; + this->band_displs = {0}; +#endif +} + +template +void DiagoPPCG::calc_prec() +{ + syncmem_var_h2d_op()(this->prec.template data(), this->h_prec.template data(), this->n_basis); +} + +template +void DiagoPPCG::calc_hpsi(const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out) +{ + hpsi_func(psi_in, hpsi_out.data(), this->n_basis, this->n_band_l); +} + +template +void DiagoPPCG::calc_grad(const ct::Tensor& prec_in, + ct::Tensor& err_out, + ct::Tensor& psi_in, + ct::Tensor& hpsi_in, + ct::Tensor& grad_out, + const int nlocked_in) +{ + int start_nband = 0; +#ifdef __MPI + if (this->plintrans.nproc_col > 1) + { + start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP]; + } +#endif + int local_nlocked = std::max(0, nlocked_in - start_nband); + local_nlocked = std::min(local_nlocked, this->n_band_l); + + // Zero out locked bands + for (int ib = 0; ib < local_nlocked; ++ib) + { + setmem_complex_op()(grad_out.data() + ib * this->n_basis, 0, this->n_basis); + err_out.data()[ib] = static_cast(0.0); + } + + for (int ib = local_nlocked; ib < this->n_band_l; ++ib) + { + T* psi_ptr = psi_in.data() + ib * this->n_basis; + T* hpsi_ptr = hpsi_in.data() + ib * this->n_basis; + T* grad_ptr = grad_out.data() + ib * this->n_basis; + + // 1. Normalize psi (and hpsi consistently) + Real norm = ModuleBase::dot_real_op()(this->n_dim, psi_ptr, psi_ptr, true); + norm = 1.0 / sqrt(norm); + ModuleBase::vector_div_constant_op()(this->n_dim, psi_ptr, psi_ptr, norm); + ModuleBase::vector_div_constant_op()(this->n_dim, hpsi_ptr, hpsi_ptr, norm); + + // 2. Rayleigh quotient: epsilo = + Real epsilo = ModuleBase::dot_real_op()(this->n_dim, psi_ptr, hpsi_ptr, true); + + // 3. Residual: grad = hpsi - epsilo * psi + ModuleBase::vector_add_vector_op()(this->n_dim, grad_ptr, hpsi_ptr, 1.0, psi_ptr, -epsilo); + + // 4. Error = ||raw residual|| + Real err = ModuleBase::dot_real_op()(this->n_dim, grad_ptr, grad_ptr, true); + err_out.data()[ib] = sqrt(err); + + // 5. Apply preconditioner: grad = grad / prec + ModuleBase::vector_div_vector_op()(this->n_dim, grad_ptr, grad_ptr, prec_in.data()); + } +} + +template +void DiagoPPCG::update_locking(const ct::Tensor& err_in, const std::vector& ethr_band) +{ + // Gather local errors to global array + std::vector local_err(this->n_band_l); + if (err_in.device_type() == ct::DeviceType::GpuDevice) + { + syncmem_var_d2h_op()(local_err.data(), err_in.data(), this->n_band_l); + } + else + { + std::memcpy(local_err.data(), err_in.data(), this->n_band_l * sizeof(Real)); + } + + std::vector global_err(this->n_band, static_cast(0.0)); + std::vector global_ethr(this->n_band, 0.0); + +#ifdef __MPI + MPI_Datatype mpi_real_type = (sizeof(Real) == sizeof(float)) ? MPI_FLOAT : MPI_DOUBLE; + MPI_Allgatherv(local_err.data(), + this->n_band_l, + mpi_real_type, + global_err.data(), + this->all_n_band_l.data(), + this->band_displs.data(), + mpi_real_type, + BP_WORLD); + + std::vector local_ethr_double(ethr_band.begin(), ethr_band.end()); + MPI_Allgatherv(local_ethr_double.data(), + this->n_band_l, + MPI_DOUBLE, + global_ethr.data(), + this->all_n_band_l.data(), + this->band_displs.data(), + MPI_DOUBLE, + BP_WORLD); +#else + for (int i = 0; i < this->n_band_l; ++i) + { + global_err[i] = local_err[i]; + global_ethr[i] = ethr_band[i]; + } +#endif + + // Gather current eigenvalues from device + std::vector current_eigen(this->n_band, static_cast(0.0)); + syncmem_var_d2h_op()(current_eigen.data(), this->eigen.data(), this->n_band); + + // Scan from current nlocked forward and lock converged bands + while (this->nlocked < this->n_band) + { + if (global_err[this->nlocked] <= global_ethr[this->nlocked]) + { + this->eigen_locked[this->nlocked] = current_eigen[this->nlocked]; + this->nlocked++; + } + else + { + break; + } + } +} + +template +void DiagoPPCG::orth_projection(const ct::Tensor& psi_in, ct::Tensor& hsub_tmp, ct::Tensor& grad_out) +{ + // hsub_tmp = psi^H * grad (n_band x n_band global) + this->pmmcn.multiply(1.0, psi_in.data(), grad_out.data(), 0.0, hsub_tmp.data()); + + // grad = grad - psi * hsub_tmp + this->plintrans.act(-1.0, psi_in.data(), hsub_tmp.data(), 1.0, grad_out.data()); +} + +template +void DiagoPPCG::orth_cholesky(ct::Tensor& workspace_in, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& hsub_out) +{ + // hsub_out = psi_out^H * psi_out + this->pmmcn.multiply(1.0, psi_out.data(), psi_out.data(), 0.0, hsub_out.data()); + + ct::kernels::set_matrix()('L', hsub_out.data(), this->n_band); + + ct::kernels::lapack_potrf()('U', this->n_band, hsub_out.data(), this->n_band); + ct::kernels::lapack_trtri()('U', 'N', this->n_band, hsub_out.data(), this->n_band); + + // Rotate psi and hpsi + this->plintrans.act(1.0, psi_out.data(), hsub_out.data(), 0.0, workspace_in.data()); + syncmem_complex_op()(psi_out.data(), workspace_in.data(), this->n_band_l * this->n_basis); + + this->plintrans.act(1.0, hpsi_out.data(), hsub_out.data(), 0.0, workspace_in.data()); + syncmem_complex_op()(hpsi_out.data(), workspace_in.data(), this->n_band_l * this->n_basis); +} + +template +bool DiagoPPCG::test_error(const ct::Tensor& err_in, const std::vector& ethr_band) +{ + Real* _err_st = err_in.data(); + bool not_conv = false; + std::vector tmp_cpu; + if (err_in.device_type() == ct::DeviceType::GpuDevice) + { + tmp_cpu.resize(this->n_band_l); + _err_st = tmp_cpu.data(); + syncmem_var_d2h_op()(_err_st, err_in.data(), this->n_band_l); + } + for (int ii = 0; ii < this->n_band_l; ++ii) + { + if (_err_st[ii] > ethr_band[ii]) + { + not_conv = true; + } + } +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, ¬_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD); +#endif + return not_conv; +} + +template +void DiagoPPCG::diag_subspace(const HPsiFunc& hpsi_func, + const bool has_p, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& p_out, + ct::Tensor& hp_out, + const int nlocked_in) +{ + const int n_sub = has_p ? 3 * this->n_band : 2 * this->n_band; + + // 1. Compute H|w> + this->calc_hpsi(hpsi_func, this->w.data(), this->hw); + + // 2. Compute overlap (S) and Hamiltonian (H) projection blocks. + // Only upper-triangular blocks are computed explicitly; + // lower-triangular parts are filled by Hermitian conjugate. + ct::Tensor b_00(t_type, device_type, {this->n_band, this->n_band}); + ct::Tensor b_01(t_type, device_type, {this->n_band, this->n_band}); + ct::Tensor b_11(t_type, device_type, {this->n_band, this->n_band}); + + this->pmmcn.multiply(one_, psi_out.data(), psi_out.data(), zero_, b_00.data()); + this->pmmcn.multiply(one_, psi_out.data(), this->w.data(), zero_, b_01.data()); + this->pmmcn.multiply(one_, this->w.data(), this->w.data(), zero_, b_11.data()); + + ct::Tensor bh_00(t_type, device_type, {this->n_band, this->n_band}); + ct::Tensor bh_01(t_type, device_type, {this->n_band, this->n_band}); + ct::Tensor bh_11(t_type, device_type, {this->n_band, this->n_band}); + + this->pmmcn.multiply(one_, psi_out.data(), hpsi_out.data(), zero_, bh_00.data()); + this->pmmcn.multiply(one_, psi_out.data(), this->hw.data(), zero_, bh_01.data()); + this->pmmcn.multiply(one_, this->w.data(), this->hw.data(), zero_, bh_11.data()); + + ct::Tensor b_02, b_12, b_22, bh_02, bh_12, bh_22; + if (has_p) + { + b_02 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + b_12 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + b_22 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + bh_02 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + bh_12 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + bh_22 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + + this->pmmcn.multiply(one_, psi_out.data(), p_out.data(), zero_, b_02.data()); + this->pmmcn.multiply(one_, this->w.data(), p_out.data(), zero_, b_12.data()); + this->pmmcn.multiply(one_, p_out.data(), p_out.data(), zero_, b_22.data()); + + this->pmmcn.multiply(one_, psi_out.data(), hp_out.data(), zero_, bh_02.data()); + this->pmmcn.multiply(one_, this->w.data(), hp_out.data(), zero_, bh_12.data()); + this->pmmcn.multiply(one_, p_out.data(), hp_out.data(), zero_, bh_22.data()); + } + + // 3. Assemble projected matrices on CPU + ct::Tensor hsub_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub}); + ct::Tensor ssub_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub}); + ct::Tensor vcc_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub}); + ct::Tensor eigen_cpu(r_type, ct::DeviceType::CpuDevice, {n_sub}); + + // Helper to copy block and optionally Hermitian-conjugate transpose + auto copy_block = [&](const ct::Tensor& dev_block, int row_off, int col_off, bool to_h, bool hc) { + std::vector tmp(this->n_band * this->n_band); + syncmem_complex_d2h_op()(tmp.data(), dev_block.data(), this->n_band * this->n_band); + T* dest = to_h ? hsub_cpu.data() : ssub_cpu.data(); + for (int j = 0; j < this->n_band; ++j) + { + for (int i = 0; i < this->n_band; ++i) + { + T val = hc ? std::conj(tmp[j + i * this->n_band]) : tmp[i + j * this->n_band]; + dest[(row_off + i) + (col_off + j) * n_sub] = val; + } + } + }; + + // S_sub assembly + copy_block(b_00, 0, 0, false, false); + copy_block(b_01, 0, this->n_band, false, false); + copy_block(b_01, this->n_band, 0, false, true); // b_10 = b_01^H + copy_block(b_11, this->n_band, this->n_band, false, false); + + // H_sub assembly + copy_block(bh_00, 0, 0, true, false); + copy_block(bh_01, 0, this->n_band, true, false); + copy_block(bh_01, this->n_band, 0, true, true); // bh_10 = bh_01^H + copy_block(bh_11, this->n_band, this->n_band, true, false); + + if (has_p) + { + copy_block(b_02, 0, 2 * this->n_band, false, false); + copy_block(b_02, 2 * this->n_band, 0, false, true); + copy_block(b_12, this->n_band, 2 * this->n_band, false, false); + copy_block(b_12, 2 * this->n_band, this->n_band, false, true); + copy_block(b_22, 2 * this->n_band, 2 * this->n_band, false, false); + + copy_block(bh_02, 0, 2 * this->n_band, true, false); + copy_block(bh_02, 2 * this->n_band, 0, true, true); + copy_block(bh_12, this->n_band, 2 * this->n_band, true, false); + copy_block(bh_12, 2 * this->n_band, this->n_band, true, true); + copy_block(bh_22, 2 * this->n_band, 2 * this->n_band, true, false); + } + + // 4. Freeze locked bands: force their rows/columns to diagonal standard basis + if (nlocked_in > 0) + { + for (int i = 0; i < nlocked_in; ++i) + { + for (int j = 0; j < n_sub; ++j) + { + T s_val = (j == i) ? one_ : zero_; + T h_val = (j == i) ? static_cast(this->eigen_locked[i]) : zero_; + hsub_cpu.data()[i + j * n_sub] = h_val; + hsub_cpu.data()[j + i * n_sub] = h_val; + ssub_cpu.data()[i + j * n_sub] = s_val; + ssub_cpu.data()[j + i * n_sub] = s_val; + } + } + } + + // 5. Solve generalized eigenvalue problem H_sub * v = lambda * S_sub * v + hsolver::hegvd_op()(nullptr, + n_sub, + n_sub, + hsub_cpu.data(), + ssub_cpu.data(), + eigen_cpu.data(), + vcc_cpu.data()); + + // Ensure locked eigenvalues remain unchanged (overwrite in case of numerical drift) + for (int i = 0; i < nlocked_in && i < this->n_band; ++i) + { + eigen_cpu.data()[i] = this->eigen_locked[i]; + } + + // 6. Move eigenvectors back to device + ct::Tensor vcc_dev = vcc_cpu.to_device(); + + // 7. Update psi = X*C_X + W*C_W + (P*C_P) + setmem_complex_op()(this->work.data(), 0, this->n_band_l * this->n_basis); + this->plintrans.act(1.0, psi_out.data(), vcc_dev.data() + 0, 0.0, this->work.data()); + this->plintrans.act(1.0, this->w.data(), vcc_dev.data() + this->n_band, 1.0, this->work.data()); + if (has_p) + { + this->plintrans.act(1.0, p_out.data(), vcc_dev.data() + 2 * this->n_band, 1.0, this->work.data()); + } + syncmem_complex_op()(psi_out.data(), this->work.data(), this->n_band_l * this->n_basis); + + // 8. Update hpsi = HX*C_X + HW*C_W + (HP*C_P) + setmem_complex_op()(this->work.data(), 0, this->n_band_l * this->n_basis); + this->plintrans.act(1.0, hpsi_out.data(), vcc_dev.data() + 0, 0.0, this->work.data()); + this->plintrans.act(1.0, this->hw.data(), vcc_dev.data() + this->n_band, 1.0, this->work.data()); + if (has_p) + { + this->plintrans.act(1.0, hp_out.data(), vcc_dev.data() + 2 * this->n_band, 1.0, this->work.data()); + } + syncmem_complex_op()(hpsi_out.data(), this->work.data(), this->n_band_l * this->n_basis); + + // 9. Update p = W*C_W + (P*C_P) -- LOBPCG style, no X component + setmem_complex_op()(this->work.data(), 0, this->n_band_l * this->n_basis); + this->plintrans.act(1.0, this->w.data(), vcc_dev.data() + this->n_band, 0.0, this->work.data()); + if (has_p) + { + this->plintrans.act(1.0, p_out.data(), vcc_dev.data() + 2 * this->n_band, 1.0, this->work.data()); + } + syncmem_complex_op()(p_out.data(), this->work.data(), this->n_band_l * this->n_basis); + + // 10. Update hp = HW*C_W + (HP*C_P) + setmem_complex_op()(this->work.data(), 0, this->n_band_l * this->n_basis); + this->plintrans.act(1.0, this->hw.data(), vcc_dev.data() + this->n_band, 0.0, this->work.data()); + if (has_p) + { + this->plintrans.act(1.0, hp_out.data(), vcc_dev.data() + 2 * this->n_band, 1.0, this->work.data()); + } + syncmem_complex_op()(hp_out.data(), this->work.data(), this->n_band_l * this->n_basis); + + // 11. Update eigenvalues with the lowest n_band eigenvalues from subspace diagonalization + syncmem_var_h2d_op()(this->eigen.data(), eigen_cpu.data(), this->n_band); +} + +template +void DiagoPPCG::diag(const HPsiFunc& hpsi_func, + T* psi_in, + Real* eigenvalue_in, + const std::vector& ethr_band) +{ + const int current_scf_iter = hsolver::DiagoIterAssist::SCF_ITER; + + // Map the input psi pointer + this->psi = std::move(ct::TensorMap(psi_in, t_type, device_type, {this->n_band_l, this->n_basis})); + + // Update precondition array + this->calc_prec(); + + // Initial subspace diagonalization to improve the initial guess + this->calc_hpsi(hpsi_func, psi_in, this->hpsi); + + // Build and diagonalize the subspace Hamiltonian in the psi basis + ct::Tensor hsub_init(t_type, device_type, {this->n_band, this->n_band}); + this->pmmcn.multiply(one_, this->hpsi.data(), this->psi.data(), zero_, hsub_init.data()); + ct::kernels::lapack_heevd()(this->n_band, + hsub_init.data(), + this->n_band, + this->eigen.data()); + + // Rotate psi and hpsi with the eigenvectors of the subspace problem + this->plintrans.act(1.0, this->psi.data(), hsub_init.data(), 0.0, this->work.data()); + syncmem_complex_op()(this->psi.data(), this->work.data(), this->n_band_l * this->n_basis); + this->plintrans.act(1.0, this->hpsi.data(), hsub_init.data(), 0.0, this->work.data()); + syncmem_complex_op()(this->hpsi.data(), this->work.data(), this->n_band_l * this->n_basis); + + // Initialize search direction to zero + setmem_complex_op()(this->p.data(), 0, this->n_band_l * this->n_basis); + setmem_complex_op()(this->hp.data(), 0, this->n_band_l * this->n_basis); + + // Allocate a reusable tensor for projection overlap + ct::Tensor hsub_orth(t_type, device_type, {this->n_band, this->n_band}); + + int ntry = 0; + int max_iter = current_scf_iter > 1 ? this->nline : this->nline * 6; + this->nlocked = 0; + + do + { + ++ntry; + + // 1. Calculate preconditioned residual w and error for active bands only + this->calc_grad(this->prec, this->err_st, this->psi, this->hpsi, this->w, this->nlocked); + + // 2. Update locking status: scan from current nlocked forward + this->update_locking(this->err_st, ethr_band); + + // 3. Exit if all bands have converged + if (this->nlocked >= this->n_band) + { + break; + } + + // 4. Project active residual to orthogonal complement of psi + this->orth_projection(this->psi, hsub_orth, this->w); + + // 5. Expanded subspace diagonalization with locking + // Locked bands are frozen in the subspace problem + this->diag_subspace(hpsi_func, ntry > 1, this->psi, this->hpsi, this->p, this->hp, this->nlocked); + + // Note: orth_cholesky is intentionally skipped here. + // The Rayleigh-Ritz step already provides orthonormal vectors (within numerical precision). + // Global Cholesky would destroy the locking by remixing all bands. + + } while (ntry < max_iter && this->nlocked < this->n_band); + + // Final subspace diagonalization to obtain accurate eigenvalues + this->pmmcn.multiply(one_, this->hpsi.data(), this->psi.data(), zero_, hsub_orth.data()); + ct::kernels::lapack_heevd()(this->n_band, + hsub_orth.data(), + this->n_band, + this->eigen.data()); + this->plintrans.act(1.0, this->psi.data(), hsub_orth.data(), 0.0, this->work.data()); + syncmem_complex_op()(this->psi.data(), this->work.data(), this->n_band_l * this->n_basis); + + // Copy eigenvalues to output + int start_nband = 0; +#ifdef __MPI + if (this->plintrans.nproc_col > 1) + { + start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP]; + } +#endif + syncmem_var_d2h_op()(eigenvalue_in, this->eigen.data() + start_nband, this->n_band_l); +} + +// Explicit template instantiations +template class DiagoPPCG, base_device::DEVICE_CPU>; +template class DiagoPPCG, base_device::DEVICE_CPU>; +#if ((defined __CUDA) || (defined __ROCM)) +template class DiagoPPCG, base_device::DEVICE_GPU>; +template class DiagoPPCG, base_device::DEVICE_GPU>; +#endif + +} // namespace hsolver diff --git a/source/source_hsolver/diago_ppcg.h b/source/source_hsolver/diago_ppcg.h new file mode 100644 index 00000000000..d0ca5a2ebbb --- /dev/null +++ b/source/source_hsolver/diago_ppcg.h @@ -0,0 +1,201 @@ +#ifndef DIAGO_PPCG_H_ +#define DIAGO_PPCG_H_ + +#include "source_base/kernels/math_kernel_op.h" +#include "source_base/module_device/memory_op.h" +#include "source_base/module_device/types.h" +#include "source_base/para_gemm.h" +#include "source_hsolver/kernels/hegvd_op.h" +#include "source_hsolver/para_linear_transform.h" + +#include +#include +#include + +namespace hsolver +{ + +/** + * @class DiagoPPCG + * @brief A class for diagonalization using the Projected Preconditioned Conjugate Gradient (PPCG) method. + * + * The DiagoPPCG class implements a block LOBPCG-like algorithm for solving generalized eigenvalue problems. + * It uses an expanded subspace [X, W, P] where X is the current eigenvector approximation, + * W is the preconditioned residual, and P is the conjugate search direction from previous steps. + * + * @tparam T The floating-point type used for calculations. + * @tparam Device The device used for calculations (e.g., cpu or gpu). + */ +template , typename Device = base_device::DEVICE_CPU> +class DiagoPPCG +{ + private: + using Real = typename GetTypeReal::type; + + public: + /** + * @brief Constructor for DiagoPPCG class. + * + * @param precondition_in Pointer to the host precondition array. + */ + explicit DiagoPPCG(const Real* precondition_in); + + /** + * @brief Destructor for DiagoPPCG class. + */ + ~DiagoPPCG(); + + /** + * @brief Initialize the class before diagonalization. + * + * @param nband The number of bands of all processes. + * @param nband_l The number of bands of current process. + * @param nbasis The number of basis functions. Leading dimension of psi. + * @param ndim The number of valid dimension of psi. + */ + void init_iter(const int nband, const int nband_l, const int nbasis, const int ndim); + + using HPsiFunc = std::function; + + /** + * @brief Diagonalize the Hamiltonian using the PPCG method. + * + * @param hpsi_func A function computing the product of the Hamiltonian matrix H + * and a wavefunction blockvector X. + * @param psi_in Pointer to input wavefunction psi matrix with [dim: n_basis x n_band, column major]. + * @param eigenvalue_in Pointer to the eigen array with [dim: n_band]. + * @param ethr_band Convergence threshold for each band. + */ + void diag(const HPsiFunc& hpsi_func, T* psi_in, Real* eigenvalue_in, const std::vector& ethr_band); + + private: + /// the number of bands of all processes + int n_band = 0; + /// the number of bands of current process + int n_band_l = 0; + /// the number of cols of the input psi + int n_basis = 0; + /// valid dimension of psi + int n_dim = 0; + /// max iter steps for ppcg loop + int nline = 4; + + /// parallel matrix multiplication + ModuleBase::PGemmCN pmmcn; + PLinearTransform plintrans; + + ct::DataType r_type = ct::DataType::DT_INVALID; + ct::DataType t_type = ct::DataType::DT_INVALID; + ct::DeviceType device_type = ct::DeviceType::UnKnown; + + ct::Tensor h_prec = {}; + ct::Tensor prec = {}; + ct::Tensor eigen = {}; + + /// Number of globally converged (locked) bands + int nlocked = 0; + /// Locked eigenvalues on CPU + std::vector eigen_locked; + /// MPI band distribution for error gathering + std::vector all_n_band_l; + std::vector band_displs; + ct::Tensor err_st = {}; + + ct::Tensor psi = {}, hpsi = {}; + ct::Tensor w = {}, hw = {}; + ct::Tensor p = {}, hp = {}; + ct::Tensor work = {}; + + Device* ctx = {}; + const T *one = nullptr, *zero = nullptr, *neg_one = nullptr; + const T one_ = static_cast(1.0), zero_ = static_cast(0.0), neg_one_ = static_cast(-1.0); + + /** + * @brief Update the precondition array from host to device. + */ + void calc_prec(); + + /** + * @brief Apply the H operator to psi and obtain the hpsi matrix. + */ + void calc_hpsi(const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out); + + /** + * @brief Calculate the preconditioned residual (gradient) and error. + * + * @param prec_in Input preconditioner. + * @param err_out Output error for each local band. + * @param psi_in Input wavefunction. + * @param hpsi_in H|psi> matrix. + * @param grad_out Output preconditioned residual. + */ + void calc_grad(const ct::Tensor& prec_in, + ct::Tensor& err_out, + ct::Tensor& psi_in, + ct::Tensor& hpsi_in, + ct::Tensor& grad_out, + const int nlocked_in = 0); + + /** + * @brief Orthogonalize grad to psi using S-inner product (S=I for norm-conserving). + * + * @param psi_in Input wavefunction. + * @param hsub_tmp Workspace for overlap matrix. + * @param grad_out Input/Output gradient. + */ + void orth_projection(const ct::Tensor& psi_in, ct::Tensor& hsub_tmp, ct::Tensor& grad_out); + + /** + * @brief Perform expanded subspace diagonalization and update X, P, HX, HP. + * + * @param hpsi_func Hamiltonian application function. + * @param has_p If true, use 3-block [X, W, P]; otherwise use 2-block [X, W]. + * @param psi_out Input/Output wavefunction. + * @param hpsi_out Input/Output H|psi>. + * @param p_out Input/Output search direction. + * @param hp_out Input/Output H|p>. + */ + void diag_subspace(const HPsiFunc& hpsi_func, + const bool has_p, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& p_out, + ct::Tensor& hp_out, + const int nlocked_in = 0); + + /** + * @brief Orthogonalize and normalize psi using Cholesky decomposition. + */ + void orth_cholesky(ct::Tensor& workspace_in, ct::Tensor& psi_out, ct::Tensor& hpsi_out, ct::Tensor& hsub_out); + + /** + * @brief Update locking status: scan errors from current nlocked forward + * and lock bands that have converged. + */ + void update_locking(const ct::Tensor& err_in, const std::vector& ethr_band); + + /** + * @brief Check if all bands have converged. + */ + bool test_error(const ct::Tensor& err_in, const std::vector& ethr_band); + + using ct_Device = typename ct::PsiToContainer::type; + using setmem_var_op = ct::kernels::set_memory; + using resmem_var_op = ct::kernels::resize_memory; + using delmem_var_op = ct::kernels::delete_memory; + using syncmem_var_h2d_op = ct::kernels::synchronize_memory; + using syncmem_var_d2h_op = ct::kernels::synchronize_memory; + + using setmem_complex_op = ct::kernels::set_memory; + using delmem_complex_op = ct::kernels::delete_memory; + using resmem_complex_op = ct::kernels::resize_memory; + using syncmem_complex_op = ct::kernels::synchronize_memory; + using syncmem_complex_h2d_op = ct::kernels::synchronize_memory; + using syncmem_complex_d2h_op = ct::kernels::synchronize_memory; + + using gemm_op = ModuleBase::gemm_op; +}; + +} // namespace hsolver + +#endif // DIAGO_PPCG_H_ diff --git a/source/source_hsolver/test/CMakeLists.txt b/source/source_hsolver/test/CMakeLists.txt index 1b1529adb4a..13a0b0032dd 100644 --- a/source/source_hsolver/test/CMakeLists.txt +++ b/source/source_hsolver/test/CMakeLists.txt @@ -16,6 +16,17 @@ if (ENABLE_MPI) ../../source_hamilt/operator.cpp ../../source_pw/module_pwdft/op_pw.cpp ) + AddTest( + TARGET MODULE_HSOLVER_ppcg_perf + LIBS parameter ${math_libs} base psi device container + SOURCES diago_ppcg_perf_test.cpp + ../diago_ppcg.cpp ../diago_bpcg.cpp ../diago_cg.cpp ../diago_david.cpp + ../para_linear_transform.cpp ../diago_iter_assist.cpp ../diag_const_nums.cpp + ../kernels/hegvd_op.cpp + ../../source_basis/module_pw/test/test_tool.cpp + ../../source_hamilt/operator.cpp + ../../source_pw/module_pwdft/op_pw.cpp + ) AddTest( TARGET MODULE_HSOLVER_cg LIBS parameter ${math_libs} base psi device container diff --git a/source/source_hsolver/test/diago_ppcg_perf_test.cpp b/source/source_hsolver/test/diago_ppcg_perf_test.cpp new file mode 100644 index 00000000000..6983d2eaccc --- /dev/null +++ b/source/source_hsolver/test/diago_ppcg_perf_test.cpp @@ -0,0 +1,284 @@ +#include "../diag_comm_info.h" +#include "../diago_bpcg.h" +#include "../diago_cg.h" +#include "../diago_david.h" +#include "../diago_iter_assist.h" +#include "../diago_ppcg.h" +#include "diago_mock.h" +#include "source_base/kernels/math_kernel_op.h" +#include "source_base/module_external/lapack_connector.h" +#include "source_psi/psi.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __MPI +#include "source_base/parallel_comm.h" +#endif + +using T = std::complex; + +// LAPACK reference eigenvalues (values only) +void lapack_eigenvalues(int npw, const std::vector& hm, double* e) +{ + std::vector tmp = hm; + int lwork = 2 * npw; + std::vector work(lwork); + std::vector rwork(3 * npw - 2); + int info = 0; + char jobz = 'N', uplo = 'U'; + zheev_(&jobz, &uplo, &npw, tmp.data(), &npw, e, work.data(), &lwork, rwork.data(), &info); +} + +// Unified H|psi> via gemm +auto make_hpsi_func(const std::vector& hmat, int dim) +{ + return [hmat, dim](T* psi_in, T* hpsi_out, const int ld_psi, const int nvec) { + T one = T(1.0); + T zero = T(0.0); + base_device::DEVICE_CPU* ctx = {}; + ModuleBase::gemm_op()('N', + 'N', + dim, + nvec, + dim, + &one, + hmat.data(), + dim, + psi_in, + ld_psi, + &zero, + hpsi_out, + ld_psi); + }; +} + +// S|psi> = |psi> (identity, norm-conserving) +auto spsi_identity = [](T* psi_in, T* spsi_out, const int ld_psi, const int nvec) { + for (int i = 0; i < ld_psi * nvec; ++i) + { + spsi_out[i] = psi_in[i]; + } +}; + +struct PerfResult +{ + std::string name; + double time = 0.0; + double max_err = 0.0; + bool converged = false; +}; + +// -------------------- PPCG -------------------- +PerfResult test_ppcg(int nband, + int npw, + double ethr, + const psi::Psi& psi0, + const std::function& hpsi_func, + double* precondition, + const std::vector& e_ref) +{ + psi::Psi psi(psi0); + psi.fix_k(0); + std::vector en(nband, 0.0); + + hsolver::DiagoPPCG ppcg(precondition); + ppcg.init_iter(nband, nband, npw, npw); + hsolver::DiagoIterAssist::SCF_ITER = 1; // first SCF step + std::vector ethr_band(nband, ethr); + + double t1 = MPI_Wtime(); + ppcg.diag(hpsi_func, psi.get_pointer(), en.data(), ethr_band); + double t2 = MPI_Wtime(); + + double err = 0.0; + for (int i = 0; i < nband; ++i) + { + err = std::max(err, std::abs(en[i] - e_ref[i])); + } + return {"PPCG", t2 - t1, err, err < 1e-2}; +} + +// -------------------- BPCG -------------------- +PerfResult test_bpcg(int nband, + int npw, + double ethr, + const psi::Psi& psi0, + const std::function& hpsi_func, + double* precondition, + const std::vector& e_ref) +{ + psi::Psi psi(psi0); + psi.fix_k(0); + std::vector en(nband, 0.0); + + hsolver::DiagoBPCG bpcg(precondition); + bpcg.init_iter(nband, nband, npw, npw); + hsolver::DiagoIterAssist::SCF_ITER = 1; + std::vector ethr_band(nband, ethr); + + double t1 = MPI_Wtime(); + bpcg.diag(hpsi_func, psi.get_pointer(), en.data(), ethr_band); + double t2 = MPI_Wtime(); + + double err = 0.0; + for (int i = 0; i < nband; ++i) + { + err = std::max(err, std::abs(en[i] - e_ref[i])); + } + return {"BPCG", t2 - t1, err, err < 1e-2}; +} + +// -------------------- CG -------------------- +PerfResult test_cg(int nband, + int npw, + double ethr, + int maxiter, + const psi::Psi& psi0, + const std::function& hpsi_func, + double* precondition, + const std::vector& e_ref) +{ + psi::Psi psi(psi0); + psi.fix_k(0); + std::vector en(nband, 0.0); + + hsolver::DiagoCG cg("pw", "scf"); + hsolver::DiagoIterAssist::PW_DIAG_NMAX = maxiter; + hsolver::DiagoIterAssist::PW_DIAG_THR = ethr; + std::vector ethr_band(nband, ethr); + + double t1 = MPI_Wtime(); + cg.diag(hpsi_func, spsi_identity, npw, nband, npw, psi.get_pointer(), en.data(), ethr_band, precondition); + double t2 = MPI_Wtime(); + + double err = 0.0; + for (int i = 0; i < nband; ++i) + { + err = std::max(err, std::abs(en[i] - e_ref[i])); + } + return {"CG", t2 - t1, err, err < 1e-2}; +} + +// -------------------- Davidson -------------------- +PerfResult test_david(int nband, + int npw, + double ethr, + int maxiter, + const psi::Psi& psi0, + const std::function& hpsi_func, + double* precondition, + const std::vector& e_ref) +{ + psi::Psi psi(psi0); + psi.fix_k(0); + std::vector en(nband, 0.0); + +#ifdef __MPI + int rank = 0, nproc = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + const hsolver::diag_comm_info comm_info(MPI_COMM_WORLD, rank, nproc); +#else + const hsolver::diag_comm_info comm_info(0, 1); +#endif + + hsolver::DiagoDavid david(precondition, nband, npw, 4, false, comm_info); + std::vector ethr_band(nband, ethr); + + double t1 = MPI_Wtime(); + david.diag(hpsi_func, spsi_identity, npw, psi.get_pointer(), en.data(), ethr_band, maxiter); + double t2 = MPI_Wtime(); + + double err = 0.0; + for (int i = 0; i < nband; ++i) + { + err = std::max(err, std::abs(en[i] - e_ref[i])); + } + return {"Davidson", t2 - t1, err, err < 1e-2}; +} + +// ============================================================ +int main(int argc, char** argv) +{ + MPI_Init(&argc, &argv); + int rank = 0, nproc = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + +#ifdef __MPI + BP_WORLD = MPI_COMM_WORLD; +#endif + + // ---------- test parameters ---------- + int nband = 20; + int npw = 500; + int sparsity = 0; // 0 = dense + double ethr = 1e-5; + int maxiter = 300; + // ------------------------------------- + + // generate Hamiltonian, precondition and initial guess + HPsi hpsi_gen(nband, npw, sparsity); + DIAGOTEST::hmatrix = hpsi_gen.hamilt(); + DIAGOTEST::npw = npw; + DIAGOTEST::npw_local = new int[1]; + DIAGOTEST::npw_local[0] = npw; + DIAGOTEST::hmatrix_local = DIAGOTEST::hmatrix; + + double* precondition = hpsi_gen.precond(); + + // LAPACK reference + std::vector e_ref(npw); + lapack_eigenvalues(npw, DIAGOTEST::hmatrix, e_ref.data()); + + // initial psi guess (perturbed eigenvectors) + psi::Psi psi0(1, nband, npw, npw, true); + std::default_random_engine p(1); + std::uniform_int_distribution u(1, 10); + for (int i = 0; i < nband; ++i) + { + for (int j = 0; j < npw; ++j) + { + double r = static_cast(u(p)) / 10.0; + psi0(0, i, j) = DIAGOTEST::hmatrix[j * npw + i] * r; + } + } + + auto hpsi_func = make_hpsi_func(DIAGOTEST::hmatrix_local, npw); + + // run benchmarks + PerfResult r_ppcg = test_ppcg(nband, npw, ethr, psi0, hpsi_func, precondition, e_ref); + PerfResult r_bpcg = test_bpcg(nband, npw, ethr, psi0, hpsi_func, precondition, e_ref); + PerfResult r_cg = test_cg(nband, npw, ethr, maxiter, psi0, hpsi_func, precondition, e_ref); + PerfResult r_david = test_david(nband, npw, ethr, maxiter, psi0, hpsi_func, precondition, e_ref); + + if (rank == 0) + { + std::cout << "\n========================================\n"; + std::cout << " Diagonalization Performance Comparison\n"; + std::cout << " nband=" << nband << ", npw=" << npw << ", sparsity=" << sparsity << "\n"; + std::cout << "========================================\n"; + std::cout << std::setw(10) << "Method" << std::setw(14) << "Time(s)" << std::setw(14) << "MaxError" + << std::setw(8) << "OK" << "\n"; + std::cout << "----------------------------------------\n"; + auto print = [](const PerfResult& r) { + std::cout << std::setw(10) << r.name << std::setw(14) << std::scientific << std::setprecision(3) << r.time + << std::setw(14) << r.max_err << std::setw(8) << (r.converged ? "Yes" : "No") << "\n"; + }; + print(r_ppcg); + print(r_bpcg); + print(r_cg); + print(r_david); + std::cout << "========================================\n\n"; + } + + delete[] DIAGOTEST::npw_local; + MPI_Finalize(); + return 0; +} From 3c96bc6fc938b41aba6c35b4ce17967cb637e52e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=BD=A6=E9=9C=96?= <16507150+liu-yanlin0518@user.noreply.gitee.com> Date: Fri, 19 Jun 2026 19:28:28 +0800 Subject: [PATCH 2/2] add openMP in bpcg --- source/source_hsolver/diago_bpcg.cpp | 213 +++++++++--------- .../source_hsolver/kernels/bpcg_kernel_op.cpp | 96 +++++--- 2 files changed, 166 insertions(+), 143 deletions(-) diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp index d4db3d790bc..e1ccaf7bbf3 100644 --- a/source/source_hsolver/diago_bpcg.cpp +++ b/source/source_hsolver/diago_bpcg.cpp @@ -1,62 +1,65 @@ #include "source_hsolver/diago_bpcg.h" #include "diago_iter_assist.h" +#include "para_linear_transform.h" #include "source_base/global_function.h" #include "source_base/kernels/math_kernel_op.h" #include "source_base/parallel_comm.h" // different MPI worlds #include "source_hsolver/kernels/bpcg_kernel_op.h" -#include "para_linear_transform.h" #include #include #include #include -namespace hsolver { +namespace hsolver +{ -template +template DiagoBPCG::DiagoBPCG(const Real* precondition_in) { - this->r_type = ct::DataTypeToEnum::value; - this->t_type = ct::DataTypeToEnum::value; - this->device_type = ct::DeviceTypeToEnum::value; + this->r_type = ct::DataTypeToEnum::value; + this->t_type = ct::DataTypeToEnum::value; + this->device_type = ct::DeviceTypeToEnum::value; - this->h_prec = std::move(ct::TensorMap((void *) precondition_in, r_type, device_type, {this->n_basis})); + this->h_prec = std::move(ct::TensorMap((void*)precondition_in, r_type, device_type, {this->n_basis})); this->one = &one_; this->zero = &zero_; this->neg_one = &neg_one_; } -template -DiagoBPCG::~DiagoBPCG() { +template +DiagoBPCG::~DiagoBPCG() +{ // Note, we do not need to free the h_prec and psi pointer as they are refs to the outside data } -template -void DiagoBPCG::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim) { +template +void DiagoBPCG::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim) +{ // Specify the problem size n_basis, n_band, while lda is n_basis - this->n_band = nband; - this->n_band_l = nband_l; - this->n_basis = nbasis; - this->n_dim = ndim; + this->n_band = nband; + this->n_band_l = nband_l; + this->n_basis = nbasis; + this->n_dim = ndim; // All column major tensors - this->beta = std::move(ct::Tensor(r_type, device_type, {this->n_band_l})); - this->eigen = std::move(ct::Tensor(r_type, device_type, {this->n_band})); - this->err_st = std::move(ct::Tensor(r_type, device_type, {this->n_band_l})); + this->beta = std::move(ct::Tensor(r_type, device_type, {this->n_band_l})); + this->eigen = std::move(ct::Tensor(r_type, device_type, {this->n_band})); + this->err_st = std::move(ct::Tensor(r_type, device_type, {this->n_band_l})); - this->hsub = std::move(ct::Tensor(t_type, device_type, {this->n_band, this->n_band})); + this->hsub = std::move(ct::Tensor(t_type, device_type, {this->n_band, this->n_band})); - this->hpsi = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); - this->work = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); - this->hgrad = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); - this->grad_old = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->hpsi = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->work = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->hgrad = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->grad_old = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); - this->prec = std::move(ct::Tensor(r_type, device_type, {this->n_basis})); + this->prec = std::move(ct::Tensor(r_type, device_type, {this->n_basis})); - this->grad = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->grad = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); #ifdef __MPI this->pmmcn.set_dimension(BP_WORLD, POOL_WORLD, n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band); this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, BP_WORLD, false); @@ -66,13 +69,14 @@ void DiagoBPCG::init_iter(const int nband, const int nband_l, const i #endif } -template +template bool DiagoBPCG::test_error(const ct::Tensor& err_in, const std::vector& ethr_band) { Real* _err_st = err_in.data(); bool not_conv = false; std::vector tmp_cpu; - if (err_in.device_type() == ct::DeviceType::GpuDevice) { + if (err_in.device_type() == ct::DeviceType::GpuDevice) + { // ct::Tensor h_err_in = err_in.to_device(); // _err_st = h_err_in.data(); // qianrui change it, because it can not pass the valgrind test @@ -80,11 +84,18 @@ bool DiagoBPCG::test_error(const ct::Tensor& err_in, const std::vecto _err_st = tmp_cpu.data(); syncmem_var_d2h_op()(_err_st, err_in.data(), this->n_band_l); } - for (int ii = 0; ii < this->n_band_l; ii++) { - if (_err_st[ii] > ethr_band[ii]) { - not_conv = true; + int not_conv_int = 0; +#ifdef _OPENMP +#pragma omp parallel for reduction(max : not_conv_int) +#endif + for (int ii = 0; ii < this->n_band_l; ii++) + { + if (_err_st[ii] > ethr_band[ii]) + { + not_conv_int = 1; } } + not_conv = (not_conv_int != 0); #ifdef __MPI MPI_Allreduce(MPI_IN_PLACE, ¬_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD); #endif @@ -92,12 +103,11 @@ bool DiagoBPCG::test_error(const ct::Tensor& err_in, const std::vecto } // Finally, the last one! -template -void DiagoBPCG::line_minimize( - ct::Tensor& grad_in, - ct::Tensor& hgrad_in, - ct::Tensor& psi_out, - ct::Tensor& hpsi_out) +template +void DiagoBPCG::line_minimize(ct::Tensor& grad_in, + ct::Tensor& hgrad_in, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out) { line_minimize_with_block_op()(grad_in.data(), hgrad_in.data(), @@ -108,40 +118,34 @@ void DiagoBPCG::line_minimize( this->n_band_l); } - // Finally, the last two! -template -void DiagoBPCG::orth_cholesky( - ct::Tensor& workspace_in, - ct::Tensor& psi_out, - ct::Tensor& hpsi_out, - ct::Tensor& hsub_out) +template +void DiagoBPCG::orth_cholesky(ct::Tensor& workspace_in, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& hsub_out) { // gemm: hsub_out(n_band x n_band) = psi_out^T(n_band x n_basis) * psi_out(n_basis x n_band) this->pmmcn.multiply(1.0, psi_out.data(), psi_out.data(), 0.0, hsub_out.data()); // set hsub matrix to lower format; - ct::kernels::set_matrix()( - 'L', hsub_out.data(), this->n_band); + ct::kernels::set_matrix()('L', hsub_out.data(), this->n_band); - ct::kernels::lapack_potrf()( - 'U', this->n_band, hsub_out.data(), this->n_band); - ct::kernels::lapack_trtri()( - 'U', 'N', this->n_band, hsub_out.data(), this->n_band); + ct::kernels::lapack_potrf()('U', this->n_band, hsub_out.data(), this->n_band); + ct::kernels::lapack_trtri()('U', 'N', this->n_band, hsub_out.data(), this->n_band); this->rotate_wf(hsub_out, psi_out, workspace_in); this->rotate_wf(hsub_out, hpsi_out, workspace_in); } -template -void DiagoBPCG::calc_grad_with_block( - const ct::Tensor& prec_in, - ct::Tensor& err_out, - ct::Tensor& beta_out, - ct::Tensor& psi_in, - ct::Tensor& hpsi_in, - ct::Tensor& grad_out, - ct::Tensor& grad_old_out) +template +void DiagoBPCG::calc_grad_with_block(const ct::Tensor& prec_in, + ct::Tensor& err_out, + ct::Tensor& beta_out, + ct::Tensor& psi_in, + ct::Tensor& hpsi_in, + ct::Tensor& grad_out, + ct::Tensor& grad_old_out) { calc_grad_with_block_op()(prec_in.data(), err_out.data(), @@ -155,17 +159,14 @@ void DiagoBPCG::calc_grad_with_block( this->n_band_l); } -template +template void DiagoBPCG::calc_prec() { syncmem_var_h2d_op()(this->prec.template data(), this->h_prec.template data(), this->n_basis); } -template -void DiagoBPCG::orth_projection( - const ct::Tensor& psi_in, - ct::Tensor& hsub_in, - ct::Tensor& grad_out) +template +void DiagoBPCG::orth_projection(const ct::Tensor& psi_in, ct::Tensor& hsub_in, ct::Tensor& grad_out) { // gemm: hsub_in(n_band x n_band) = psi_in^T(n_band x n_basis) * grad_out(n_basis x n_band) this->pmmcn.multiply(1.0, psi_in.data(), grad_out.data(), 0.0, hsub_in.data()); @@ -176,11 +177,8 @@ void DiagoBPCG::orth_projection( return; } -template -void DiagoBPCG::rotate_wf( - const ct::Tensor& hsub_in, - ct::Tensor& psi_out, - ct::Tensor& workspace_in) +template +void DiagoBPCG::rotate_wf(const ct::Tensor& hsub_in, ct::Tensor& psi_out, ct::Tensor& workspace_in) { // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band) this->plintrans.act(1.0, psi_out.data(), hsub_in.data(), 0.0, workspace_in.data()); @@ -189,47 +187,46 @@ void DiagoBPCG::rotate_wf( return; } -template -void DiagoBPCG::calc_hpsi_with_block( - const HPsiFunc& hpsi_func, - T *psi_in, - ct::Tensor& hpsi_out) +template +void DiagoBPCG::calc_hpsi_with_block(const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out) { // calculate all-band hpsi hpsi_func(psi_in, hpsi_out.data(), this->n_basis, this->n_band_l); } -template -void DiagoBPCG::diag_hsub( - const ct::Tensor& psi_in, - const ct::Tensor& hpsi_in, - ct::Tensor& hsub_out, - ct::Tensor& eigenvalue_out) +template +void DiagoBPCG::diag_hsub(const ct::Tensor& psi_in, + const ct::Tensor& hpsi_in, + ct::Tensor& hsub_out, + ct::Tensor& eigenvalue_out) { // gemm: hsub_out(n_band x n_band) = hpsi_in^T(n_band x n_basis) * psi_in(n_basis x n_band) this->pmmcn.multiply(1.0, hpsi_in.data(), psi_in.data(), 0.0, hsub_out.data()); - // ct::kernels::lapack_heevd()('V', 'U', hsub_out.data(), this->n_band, eigenvalue_out.data()); - ct::kernels::lapack_heevd()(this->n_band, hsub_out.data(), this->n_band, eigenvalue_out.data()); + // ct::kernels::lapack_heevd()('V', 'U', hsub_out.data(), this->n_band, + // eigenvalue_out.data()); + ct::kernels::lapack_heevd()(this->n_band, + hsub_out.data(), + this->n_band, + eigenvalue_out.data()); return; } -template -void DiagoBPCG::calc_hsub_with_block( - const HPsiFunc& hpsi_func, - T *psi_in, - ct::Tensor& psi_out, - ct::Tensor& hpsi_out, - ct::Tensor& hsub_out, - ct::Tensor& workspace_in, - ct::Tensor& eigenvalue_out) +template +void DiagoBPCG::calc_hsub_with_block(const HPsiFunc& hpsi_func, + T* psi_in, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& hsub_out, + ct::Tensor& workspace_in, + ct::Tensor& eigenvalue_out) { // Apply the H operator to psi and obtain the hpsi matrix. this->calc_hpsi_with_block(hpsi_func, psi_in, hpsi_out); // Diagonalization of the subspace matrix. - this->diag_hsub(psi_out,hpsi_out, hsub_out, eigenvalue_out); + this->diag_hsub(psi_out, hpsi_out, hsub_out, eigenvalue_out); // inplace matmul to get the initial guessed wavefunction psi. // psi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band] @@ -240,13 +237,12 @@ void DiagoBPCG::calc_hsub_with_block( return; } -template -void DiagoBPCG::calc_hsub_with_block_exit( - ct::Tensor& psi_out, - ct::Tensor& hpsi_out, - ct::Tensor& hsub_out, - ct::Tensor& workspace_in, - ct::Tensor& eigenvalue_out) +template +void DiagoBPCG::calc_hsub_with_block_exit(ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& hsub_out, + ct::Tensor& workspace_in, + ct::Tensor& eigenvalue_out) { // Diagonalization of the subspace matrix. this->diag_hsub(psi_out, hpsi_out, hsub_out, eigenvalue_out); @@ -266,7 +262,8 @@ void DiagoBPCG::diag(const HPsiFunc& hpsi_func, { const int current_scf_iter = hsolver::DiagoIterAssist::SCF_ITER; // Get the pointer of the input psi - this->psi = std::move(ct::TensorMap(psi_in /*psi_in.get_pointer()*/, t_type, device_type, {this->n_band_l, this->n_basis})); + this->psi = std::move( + ct::TensorMap(psi_in /*psi_in.get_pointer()*/, t_type, device_type, {this->n_band_l, this->n_basis})); // Update the precondition array this->calc_prec(); @@ -279,9 +276,7 @@ void DiagoBPCG::diag(const HPsiFunc& hpsi_func, setmem_var_op()(this->beta.template data(), std::numeric_limits::infinity(), this->n_band_l); int ntry = 0; - int max_iter = current_scf_iter > 1 ? - this->nline : - this->nline * 6; + int max_iter = current_scf_iter > 1 ? this->nline : this->nline * 6; do { ++ntry; @@ -291,8 +286,13 @@ void DiagoBPCG::diag(const HPsiFunc& hpsi_func, // 3. calculate the gradient by hpsi - epsilo * psi // 4. gradient mix with the previous gradient // 5. Do precondition - this->calc_grad_with_block(this->prec, this->err_st, this->beta, - this->psi, this->hpsi, this->grad, this->grad_old); + this->calc_grad_with_block(this->prec, + this->err_st, + this->beta, + this->psi, + this->hpsi, + this->grad, + this->grad_old); // Orthogonalize column vectors g_i in matrix grad to column vectors p_j in matrix psi // for all 'j less or equal to i'. @@ -314,7 +314,8 @@ void DiagoBPCG::diag(const HPsiFunc& hpsi_func, // orthogonal psi by cholesky method this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub); - if (current_scf_iter == 1 && ntry % this->nline == 0) { + if (current_scf_iter == 1 && ntry % this->nline == 0) + { this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen); } } while (ntry < max_iter && this->test_error(this->err_st, ethr_band)); diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp index 88f94e288c6..cc2a5f7736f 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp +++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp @@ -1,7 +1,9 @@ #include "source_hsolver/kernels/bpcg_kernel_op.h" -#include "source_base/module_external/blas_connector.h" + #include "source_base/kernels/math_kernel_op.h" +#include "source_base/module_external/blas_connector.h" #include "source_base/parallel_reduce.h" + #include namespace hsolver { @@ -26,6 +28,9 @@ struct line_minimize_with_block_op Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : epsilo_0, epsilo_1, epsilo_2) +#endif for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -41,6 +46,9 @@ struct line_minimize_with_block_op theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); cos_theta = std::cos(theta); sin_theta = std::sin(theta); +#ifdef _OPENMP +#pragma omp parallel for +#endif for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -77,6 +85,9 @@ struct calc_grad_with_block_op Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : epsilo) +#endif for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -85,6 +96,9 @@ struct calc_grad_with_block_op epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item])); } Parallel_Reduce::reduce_pool(epsilo); +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : err, beta) +#endif for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -95,6 +109,9 @@ struct calc_grad_with_block_op } Parallel_Reduce::reduce_pool(err); Parallel_Reduce::reduce_pool(beta); +#ifdef _OPENMP +#pragma omp parallel for +#endif for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -111,8 +128,16 @@ template struct apply_eigenvalues_op { using Real = typename GetTypeReal::type; - void operator()(const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues) + void operator()(const int& nbase, + const int& nbase_x, + const int& notconv, + T* result, + const T* vectors, + const Real* eigenvalues) { +#ifdef _OPENMP +#pragma omp parallel for collapse(2) +#endif for (int m = 0; m < notconv; m++) { for (int idx = 0; idx < nbase; idx++) @@ -124,59 +149,62 @@ struct apply_eigenvalues_op }; template -struct precondition_op { +struct precondition_op +{ using Real = typename GetTypeReal::type; void operator()(const int& dim, - T* psi_iter, - const int& nbase, - const int& notconv, - const Real* precondition, - const Real* eigenvalues) + T* psi_iter, + const int& nbase, + const int& notconv, + const Real* precondition, + const Real* eigenvalues) { - std::vector pre(dim, 0.0); +#ifdef _OPENMP +#pragma omp parallel for +#endif for (int m = 0; m < notconv; m++) { + std::vector pre(dim, 0.0); for (size_t i = 0; i < dim; i++) { Real x = std::abs(precondition[i] - eigenvalues[m]); pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); } - ModuleBase::vector_div_vector_op()( - dim, - psi_iter + (nbase + m) * dim, - psi_iter + (nbase + m) * dim, - pre.data()); + ModuleBase::vector_div_vector_op()(dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + pre.data()); } } }; template -struct normalize_op { +struct normalize_op +{ void operator()(const int& dim, - T* psi_iter, - const int& nbase, - const int& notconv, - typename GetTypeReal::type* psi_norm) + T* psi_iter, + const int& nbase, + const int& notconv, + typename GetTypeReal::type* psi_norm) { using Real = typename GetTypeReal::type; for (int m = 0; m < notconv; m++) { // Calculate norm using dot_real_op - Real psi_m_norm = ModuleBase::dot_real_op()( - dim, - psi_iter + (nbase + m) * dim, - psi_iter + (nbase + m) * dim, - true); + Real psi_m_norm = ModuleBase::dot_real_op()(dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + true); assert(psi_m_norm > 0.0); psi_m_norm = sqrt(psi_m_norm); // Normalize using vector_div_constant_op - ModuleBase::vector_div_constant_op()( - dim, - psi_iter + (nbase + m) * dim, - psi_iter + (nbase + m) * dim, - psi_m_norm); - if (psi_norm) { + ModuleBase::vector_div_constant_op()(dim, + psi_iter + (nbase + m) * dim, + psi_iter + (nbase + m) * dim, + psi_m_norm); + if (psi_norm) + { psi_norm[m] = psi_m_norm; } } @@ -187,13 +215,7 @@ template struct refresh_hcc_scc_vcc_op { using Real = typename GetTypeReal::type; - void operator()(const int &n, - T *hcc, - T *scc, - T *vcc, - const int &ldh, - const Real *eigenvalue, - const T &one) + void operator()(const int& n, T* hcc, T* scc, T* vcc, const int& ldh, const Real* eigenvalue, const T& one) { #ifdef _OPENMP #pragma omp parallel for collapse(1) schedule(static)