From 1c09df2777f6738c4e2413241af0a57b7fe2fdbd Mon Sep 17 00:00:00 2001 From: liutao Date: Wed, 17 Jun 2026 12:16:39 +0800 Subject: [PATCH 1/8] feat(lcao): auto-recommend nb2d from ScaLAPACK 2D block-cyclic load balance Derive a healthy nb2d window from the per-pool process grid (q) and matrix dim N=nlocal: nb_lo=min(16,N/(2q)) <= nb2d <= nb_hi=floor(N/(2q)), recommended nb2d=min(64,nb_hi). Avoids both the large-block load-imbalance cliff and the nb=1 over-scatter end of the U-shaped time-vs-nb2d curve. Prints the recommendation in the run-info banner; energy is unaffected (performance-only). Co-Authored-By: Claude Opus 4.8 (1M context) --- source/source_io/module_output/print_info.cpp | 27 ++++++ source/source_lcao/LCAO_init_basis.cpp | 83 +++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/source/source_io/module_output/print_info.cpp b/source/source_io/module_output/print_info.cpp index 398cbb49a8f..6aff116257b 100644 --- a/source/source_io/module_output/print_info.cpp +++ b/source/source_io/module_output/print_info.cpp @@ -412,6 +412,33 @@ void print_kpar(const int &nks, const int &kpar_lcao) "%%%%%%%%%%%%\n"; } } + + // 16) recommend the optimal kpar for k-point parallelism. + // kpar splits the processes into pools, each diagonalizing a subset of the nks + // k-points independently -> near-linear speedup of the k-point loop. The ceiling is + // the number of k-points, so the optimal kpar is the largest divisor of NPROC that + // does not exceed nks (more pools than k-points leaves pools idle). Perfect balance + // additionally wants nks % kpar == 0 (see the warning above); very large systems may + // prefer fewer pools to keep enough ranks per pool for the per-pool diagonalization. + // This is advisory only -- kpar fixes the MPI pool layout and cannot be changed here. + int kpar_opt = 1; + for (int d = GlobalV::NPROC; d >= 1; --d) + { + if (GlobalV::NPROC % d == 0 && d <= nks) + { + kpar_opt = d; + break; + } + } + if (kpar_opt != kpar_lcao) + { + GlobalV::ofs_running << " kpar advisory: current kpar = " << kpar_lcao << " (NPROC = " + << GlobalV::NPROC << ", nks = " << nks << "). Recommended kpar = " + << kpar_opt << " (largest divisor of NPROC <= nks) to parallelize" + << " the k-point loop.\n"; + ModuleBase::WARNING("ModuleIO::print_kpar", + "kpar is not optimal; see running log for the recommended value."); + } } } // namespace ModuleIO diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp index c18e859599a..743ca8e80d7 100644 --- a/source/source_lcao/LCAO_init_basis.cpp +++ b/source/source_lcao/LCAO_init_basis.cpp @@ -3,6 +3,8 @@ #include "source_io/module_parameter/parameter.h" #include "source_base/parallel_comm.h" +#include + namespace LCAO_domain { @@ -78,6 +80,87 @@ void init_basis_lcao(Parallel_Orbitals& pv, try_nb = pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning); } + // ---- nb2d (ScaLAPACK 2D block-cyclic block size) load-balance check ---- + // ScaLAPACK diagonalizes the N x N matrix (N = nlocal) on a p x q process grid + // (p <= q) with square block size nb2d. With kpar k-point pools the diagonalization + // runs *per pool* on NPROC/kpar processes (Parallel_K2D::P2D_pool, whose block size + // is ParaV->get_block_size() == pv.nb), so the effective grid is the near-square + // factorization of NPROC/kpar -- exactly how Parallel_2D builds it. The long edge q + // governs load balance: + // B = N / (nb2d * q) blocks owned per process along q. + // B <= 1 : one process owns a whole panel while others idle -> a catastrophic + // load-imbalance "cliff" (nb2d too large; energy unaffected, time blows up). + // But nb2d also must not be too SMALL: blocks below ~16 lose BLAS/GEMM efficiency and + // explode block-cyclic communication (nb=1 = the slow "over-scatter" end). So the time- + // vs-nb2d curve is U-shaped and the healthy window is two-sided: + // nb_lo = min(16, N/(2q)) <= nb2d <= nb_hi = floor(N/(2q)) + // recommended nb2d = min(64, nb_hi) (largest balanced block, capped for BLAS). + // This block-size U-curve is a property of the ScaLAPACK 2D block-cyclic *dense* + // diagonalization, so the check is restricted to ks_solver=scalapack_gvx. (genelpa/ + // elpa do their own internal block tuning; lapack/cusolver/pexsi do not diagonalize + // on this distributed 2D grid, so the nb2d cliff does not apply to them.) + if (PARAM.inp.ks_solver == "scalapack_gvx") + { + const int kpar = (PARAM.globalv.kpar_lcao > 0) ? PARAM.globalv.kpar_lcao : 1; + const int np_total = pv.dim0 * pv.dim1; // diagonalization world size + const int np_pool = (kpar > 0) ? np_total / kpar : np_total; // processes per pool + if (np_pool > 1 && nlocal > 0) + { + // near-square factorization np_pool = p * q, p <= q (matches Parallel_2D) + int p_row = static_cast(std::sqrt(np_pool + 0.5)); + while (p_row > 1 && np_pool % p_row != 0) { --p_row; } + const int p_col = np_pool / p_row; // long edge q (>= p_row) + + // Healthy block-size window is two-sided (the nb2d-vs-time curve is U-shaped): + // nb_hi = floor(N / (2*q)) upper bound -- keep >= 2 blocks per process + // (B >= 2); larger nb -> load imbalance "cliff". + // nb_lo = min(16, nb_hi) lower bound -- blocks below ~16 lose BLAS/GEMM + // efficiency (1x1 ops) and explode block-cyclic + // communication (panel count ~ N/nb); nb=1 is the + // slow "over-scatter" end. Capped by nb_hi because a + // tiny system on many processes cannot afford large + // blocks (then balance wins and nb_lo == nb_hi). + // recommended = the largest balanced block, capped at 64 for BLAS efficiency. + const int nb_hi = (nlocal >= 2 * p_col) ? nlocal / (2 * p_col) : 1; + const int nb_lo = (16 < nb_hi) ? 16 : nb_hi; + const int nb_opt = (nb_hi < 64) ? nb_hi : 64; + const int nb_cur = pv.nb; + + const char* issue = nullptr; + if (nb_cur > nb_hi) + { + issue = "too large -> ScaLAPACK load imbalance (one process owns a whole panel)"; + } + else if (nb_cur < nb_lo) + { + issue = "too small -> over-scatter (poor BLAS efficiency and heavy communication)"; + } + + // Two cases, both reported via ofs_warning: + // (1) user explicitly set nb2d (PARAM.inp.nb2d != 0): respect it -- do NOT + // change the value -- but warn so the issue is visible. + // (2) auto nb2d (PARAM.inp.nb2d == 0): correct it to nb_opt. pv.nb feeds both + // the kpar==1 path and Parallel_K2D (ParaV->get_block_size()), so + // resetting it here also fixes the per-pool diagonalization. + if (issue != nullptr) + { + if (PARAM.inp.nb2d != 0) + { + GlobalV::ofs_warning << "init_basis_lcao: user-set nb2d=" << nb_cur << " is " << issue + << " for N=" << nlocal << ", kpar=" << kpar << " (per-pool grid " << p_row << "x" + << p_col << "); recommended nb2d=" << nb_opt << " (user value kept, not changed).\n"; + } + else + { + GlobalV::ofs_warning << "init_basis_lcao: auto nb2d=" << nb_cur << " is " << issue + << " for N=" << nlocal << ", kpar=" << kpar << "; auto-adjusted to nb2d=" << nb_opt << ".\n"; + pv.set(nlocal, nlocal, nb_opt, pv.blacs_ctxt); + pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning); + } + } + } + } + // init blacs context for genelpa pv.set_desc_wfc_Eij(nlocal, PARAM.inp.nbands, pv.nrow); From 92e2b1e854908e023c5ce02a5f9ddb9f353cd078 Mon Sep 17 00:00:00 2001 From: liutao Date: Wed, 17 Jun 2026 12:52:30 +0800 Subject: [PATCH 2/8] fix(lcao): address Copilot review on nb2d/kpar advisory - LCAO_init_basis: skip the nb2d load-balance heuristic when NPROC is not divisible by kpar (pool size ill-defined) instead of using a truncated np_total/kpar that yields a wrong process-grid estimate. - print_info: start the kpar divisor scan at min(nks, NPROC) so out-of-range candidates are skipped rather than scanning all of [1, NPROC]. - print_info: drop the ModuleBase::WARNING for a non-optimal kpar -- it is advisory only and the truly problematic cases (nks % kpar != 0, kpar > nks) are already warned about above. Keep the running-log advisory line. Co-Authored-By: Claude Opus 4.8 (1M context) --- source/source_io/module_output/print_info.cpp | 12 ++++++++---- source/source_lcao/LCAO_init_basis.cpp | 6 +++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/source/source_io/module_output/print_info.cpp b/source/source_io/module_output/print_info.cpp index 6aff116257b..620f7bc7b10 100644 --- a/source/source_io/module_output/print_info.cpp +++ b/source/source_io/module_output/print_info.cpp @@ -421,10 +421,12 @@ void print_kpar(const int &nks, const int &kpar_lcao) // additionally wants nks % kpar == 0 (see the warning above); very large systems may // prefer fewer pools to keep enough ranks per pool for the per-pool diagonalization. // This is advisory only -- kpar fixes the MPI pool layout and cannot be changed here. + // start the downward divisor scan at min(nks, NPROC) so we skip the d > nks + // candidates outright instead of iterating all of [1, NPROC]. int kpar_opt = 1; - for (int d = GlobalV::NPROC; d >= 1; --d) + for (int d = (nks < GlobalV::NPROC) ? nks : GlobalV::NPROC; d >= 1; --d) { - if (GlobalV::NPROC % d == 0 && d <= nks) + if (GlobalV::NPROC % d == 0) { kpar_opt = d; break; @@ -432,12 +434,14 @@ void print_kpar(const int &nks, const int &kpar_lcao) } if (kpar_opt != kpar_lcao) { + // Advisory only: many kpar choices are deliberate (e.g. keeping more ranks + // per pool for the per-pool diagonalization), so this is an info-level note + // rather than a WARNING. The genuinely problematic cases (nks not divisible + // by kpar, or kpar > nks leaving idle pools) are already flagged above. GlobalV::ofs_running << " kpar advisory: current kpar = " << kpar_lcao << " (NPROC = " << GlobalV::NPROC << ", nks = " << nks << "). Recommended kpar = " << kpar_opt << " (largest divisor of NPROC <= nks) to parallelize" << " the k-point loop.\n"; - ModuleBase::WARNING("ModuleIO::print_kpar", - "kpar is not optimal; see running log for the recommended value."); } } diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp index 743ca8e80d7..c64958c35ad 100644 --- a/source/source_lcao/LCAO_init_basis.cpp +++ b/source/source_lcao/LCAO_init_basis.cpp @@ -103,7 +103,11 @@ void init_basis_lcao(Parallel_Orbitals& pv, { const int kpar = (PARAM.globalv.kpar_lcao > 0) ? PARAM.globalv.kpar_lcao : 1; const int np_total = pv.dim0 * pv.dim1; // diagonalization world size - const int np_pool = (kpar > 0) ? np_total / kpar : np_total; // processes per pool + // Pools split the world evenly; if np_total is not divisible by kpar the + // pool size is ill-defined, so skip the (purely advisory) heuristic rather + // than estimate the grid from a truncated np_total/kpar. + const bool pool_well_defined = (kpar > 0 && np_total % kpar == 0); + const int np_pool = pool_well_defined ? np_total / kpar : 0; // processes per pool if (np_pool > 1 && nlocal > 0) { // near-square factorization np_pool = p * q, p <= q (matches Parallel_2D) From 81fa44a8c9860f892936a4342986a480c1b46499 Mon Sep 17 00:00:00 2001 From: liutao Date: Wed, 17 Jun 2026 13:30:45 +0800 Subject: [PATCH 3/8] fix(lcao): validate auto-adjusted nb2d, revert if band/grid-incompatible The nb2d auto-adjust ignored the return value of set_nloc_wfc_Eij. When the recommended nb_opt (derived from matrix dim + grid only) makes ceil(nbands/nb_opt) smaller than the process-grid width, set_nloc_wfc_Eij returns early without updating ncol_bands/nrow_bands/nloc_wfc, leaving pv half-updated (new nb/nrow from pv.set, stale band sizes). This segfaulted the gamma-only scalapack_gvx cases (02_NAO_Gamma: scf_bsse, scf_force_stress, scf_out_hk, scf_out_hxc; N=26, nbands=6, 2x2 grid -> nb_opt=6 -> block=1 --- source/source_lcao/LCAO_init_basis.cpp | 27 ++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp index c64958c35ad..c8f1a005558 100644 --- a/source/source_lcao/LCAO_init_basis.cpp +++ b/source/source_lcao/LCAO_init_basis.cpp @@ -156,10 +156,29 @@ void init_basis_lcao(Parallel_Orbitals& pv, } else { - GlobalV::ofs_warning << "init_basis_lcao: auto nb2d=" << nb_cur << " is " << issue - << " for N=" << nlocal << ", kpar=" << kpar << "; auto-adjusted to nb2d=" << nb_opt << ".\n"; - pv.set(nlocal, nlocal, nb_opt, pv.blacs_ctxt); - pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning); + // Re-distribute with the recommended block size, but validate it the + // same way the initial distribution is validated above: set_nloc_wfc_Eij + // returns non-zero when nb_opt is incompatible with the band/grid layout + // (e.g. ceil(nbands/nb_opt) < process-grid width). nb_opt is derived from + // the matrix dimension and grid alone, so it can violate that band + // constraint on small systems. If it does, revert to the previously + // validated nb2d -- leaving pv half-updated (new nb/nrow but stale + // band sizes) would otherwise crash the later wavefunction setup. + int retry = pv.set(nlocal, nlocal, nb_opt, pv.blacs_ctxt); + retry += pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning); + if (retry != 0) + { + pv.set(nlocal, nlocal, nb_cur, pv.blacs_ctxt); + pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning); + GlobalV::ofs_warning << "init_basis_lcao: auto nb2d=" << nb_cur << " is " << issue + << " for N=" << nlocal << ", kpar=" << kpar << "; recommended nb2d=" << nb_opt + << " is incompatible with the band/grid layout, so nb2d=" << nb_cur << " is kept.\n"; + } + else + { + GlobalV::ofs_warning << "init_basis_lcao: auto nb2d=" << nb_cur << " is " << issue + << " for N=" << nlocal << ", kpar=" << kpar << "; auto-adjusted to nb2d=" << nb_opt << ".\n"; + } } } } From 2ddf0de2a7dd1f33b6d1f7e80e6c26feb81646dd Mon Sep 17 00:00:00 2001 From: liutao Date: Wed, 17 Jun 2026 13:39:26 +0800 Subject: [PATCH 4/8] fix(lcao): derive per-pool size from the real pool communicator, not pv grid The nb2d load-balance check estimated the per-pool process count as pv.dim0 * pv.dim1 / kpar. But pv is built on DIAG_WORLD (size = diago_proc), whereas hsolver's parakSolve re-splits MPI_COMM_WORLD into kpar pools of NPROC/kpar ranks each (Parallel_K2D -> divide_mpi_groups(NPROC, kpar)), independent of diago_proc. So whenever diago_proc < NPROC the old expression gave diago_proc/kpar != NPROC/kpar -- a wrong (p_row, p_col) grid and hence a wrong nb_hi/nb_lo window and auto-adjust decision. Take the pool size from the actual source instead: - kpar == 1: the diagonalization runs on this ParaV grid -> pv.dim0 * pv.dim1. - kpar > 1: NPROC / kpar (the parakSolve pool size); skip when NPROC % kpar != 0 since the pools are then uneven and have no single size. Co-Authored-By: Claude Opus 4.8 (1M context) --- source/source_lcao/LCAO_init_basis.cpp | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp index c8f1a005558..1261ae06d8d 100644 --- a/source/source_lcao/LCAO_init_basis.cpp +++ b/source/source_lcao/LCAO_init_basis.cpp @@ -102,12 +102,25 @@ void init_basis_lcao(Parallel_Orbitals& pv, if (PARAM.inp.ks_solver == "scalapack_gvx") { const int kpar = (PARAM.globalv.kpar_lcao > 0) ? PARAM.globalv.kpar_lcao : 1; - const int np_total = pv.dim0 * pv.dim1; // diagonalization world size - // Pools split the world evenly; if np_total is not divisible by kpar the - // pool size is ill-defined, so skip the (purely advisory) heuristic rather - // than estimate the grid from a truncated np_total/kpar. - const bool pool_well_defined = (kpar > 0 && np_total % kpar == 0); - const int np_pool = pool_well_defined ? np_total / kpar : 0; // processes per pool + // Processes that actually run ONE (per-pool) diagonalization: + // kpar == 1 : the full diagonalization grid IS this ParaV grid (built on + // DIAG_WORLD), so use pv.dim0 * pv.dim1 directly. + // kpar > 1 : hsolver's parakSolve re-splits MPI_COMM_WORLD into kpar pools + // (Parallel_K2D -> divide_mpi_groups(NPROC, kpar)), NOT the + // DIAG_WORLD grid -- each pool owns NPROC/kpar ranks. Deriving + // the pool size from pv.dim0*pv.dim1/kpar would be wrong whenever + // diago_proc < NPROC (then DIAG_WORLD != NPROC). When NPROC is + // not divisible by kpar the pools are uneven (no single size), + // so skip the purely advisory heuristic. + int np_pool = 0; // processes per pool (0 => skip the check) + if (kpar <= 1) + { + np_pool = pv.dim0 * pv.dim1; + } + else if (GlobalV::NPROC % kpar == 0) + { + np_pool = GlobalV::NPROC / kpar; + } if (np_pool > 1 && nlocal > 0) { // near-square factorization np_pool = p * q, p <= q (matches Parallel_2D) From 9d8160a786ab251bfb763529c9e6e1d638f59e1f Mon Sep 17 00:00:00 2001 From: liutao Date: Wed, 17 Jun 2026 14:18:51 +0800 Subject: [PATCH 5/8] fix(lcao): keep auto-recommended nb2d even for noncollinear nspin==4 For nspin==4 the basis carries 2-component spinors that must stay paired within a ScaLAPACK block, so nb2d must be a multiple of 2 (the autoset and fallback already use 2, not 1, for this reason). The nb2d recommendation nb_opt = min(64, floor(N/2q)) could be odd (e.g. N=108, q=2 -> 27), which broke the spinor blocking and segfaulted the nspin==4 diagonalization (03_NAO_multik: scf_u_spin4, scf_out_dos_spin4, scf_out_mul_spin4). Snap the whole nb2d window (nb_lo/nb_hi/nb_opt) to the nspin granularity. Verified locally in the abacus-gnu Docker image: 02_NAO_Gamma, 03_NAO_multik and 05_rtTDDFT integration suites all pass 100%. Co-Authored-By: Claude Opus 4.8 (1M context) --- source/source_lcao/LCAO_init_basis.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp index 1261ae06d8d..9b074fe9fc7 100644 --- a/source/source_lcao/LCAO_init_basis.cpp +++ b/source/source_lcao/LCAO_init_basis.cpp @@ -138,9 +138,17 @@ void init_basis_lcao(Parallel_Orbitals& pv, // tiny system on many processes cannot afford large // blocks (then balance wins and nb_lo == nb_hi). // recommended = the largest balanced block, capped at 64 for BLAS efficiency. - const int nb_hi = (nlocal >= 2 * p_col) ? nlocal / (2 * p_col) : 1; - const int nb_lo = (16 < nb_hi) ? 16 : nb_hi; - const int nb_opt = (nb_hi < 64) ? nb_hi : 64; + // For noncollinear nspin==4 the basis carries 2-component spinors that must + // stay paired inside one ScaLAPACK block, so the block size must be a multiple + // of 2 -- this is exactly why the autoset above and the fallback use nb2d=2 (not + // 1) for nspin==4. Snap the whole window to that granularity so the recommended + // value can never break the spinor blocking (an odd nb2d segfaults the nspin==4 + // diagonalization). + const int nb_unit = (PARAM.inp.nspin == 4) ? 2 : 1; + auto snap = [nb_unit](int v) { v = v / nb_unit * nb_unit; return v < nb_unit ? nb_unit : v; }; + const int nb_hi = snap((nlocal >= 2 * p_col) ? nlocal / (2 * p_col) : 1); + const int nb_lo = snap((16 < nb_hi) ? 16 : nb_hi); + const int nb_opt = snap((nb_hi < 64) ? nb_hi : 64); const int nb_cur = pv.nb; const char* issue = nullptr; From a45621a2efdb2047481a01db4d24f268b7ca8459 Mon Sep 17 00:00:00 2001 From: liutao Date: Wed, 17 Jun 2026 17:07:39 +0800 Subject: [PATCH 6/8] docs(lcao): condense nb2d load-balance comments Trim the verbose explanatory comments in the nb2d block to the essential rationale; no code change. Co-Authored-By: Claude Opus 4.8 (1M context) --- source/source_lcao/LCAO_init_basis.cpp | 77 +++++++------------------- 1 file changed, 20 insertions(+), 57 deletions(-) diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp index 9b074fe9fc7..3ed0181cb1f 100644 --- a/source/source_lcao/LCAO_init_basis.cpp +++ b/source/source_lcao/LCAO_init_basis.cpp @@ -82,36 +82,18 @@ void init_basis_lcao(Parallel_Orbitals& pv, // ---- nb2d (ScaLAPACK 2D block-cyclic block size) load-balance check ---- // ScaLAPACK diagonalizes the N x N matrix (N = nlocal) on a p x q process grid - // (p <= q) with square block size nb2d. With kpar k-point pools the diagonalization - // runs *per pool* on NPROC/kpar processes (Parallel_K2D::P2D_pool, whose block size - // is ParaV->get_block_size() == pv.nb), so the effective grid is the near-square - // factorization of NPROC/kpar -- exactly how Parallel_2D builds it. The long edge q - // governs load balance: - // B = N / (nb2d * q) blocks owned per process along q. - // B <= 1 : one process owns a whole panel while others idle -> a catastrophic - // load-imbalance "cliff" (nb2d too large; energy unaffected, time blows up). - // But nb2d also must not be too SMALL: blocks below ~16 lose BLAS/GEMM efficiency and - // explode block-cyclic communication (nb=1 = the slow "over-scatter" end). So the time- - // vs-nb2d curve is U-shaped and the healthy window is two-sided: - // nb_lo = min(16, N/(2q)) <= nb2d <= nb_hi = floor(N/(2q)) - // recommended nb2d = min(64, nb_hi) (largest balanced block, capped for BLAS). - // This block-size U-curve is a property of the ScaLAPACK 2D block-cyclic *dense* - // diagonalization, so the check is restricted to ks_solver=scalapack_gvx. (genelpa/ - // elpa do their own internal block tuning; lapack/cusolver/pexsi do not diagonalize - // on this distributed 2D grid, so the nb2d cliff does not apply to them.) + // (p <= q) with square block nb2d. The time-vs-nb2d curve is U-shaped: too large + // -> load-imbalance cliff (one process owns a whole panel); too small -> poor BLAS + // and heavy block-cyclic communication. Healthy window [nb_lo, nb_hi] below. + // Only scalapack_gvx diagonalizes on this 2D grid (genelpa/elpa tune internally; + // lapack/cusolver/pexsi do not), so the check is restricted to it. if (PARAM.inp.ks_solver == "scalapack_gvx") { const int kpar = (PARAM.globalv.kpar_lcao > 0) ? PARAM.globalv.kpar_lcao : 1; - // Processes that actually run ONE (per-pool) diagonalization: - // kpar == 1 : the full diagonalization grid IS this ParaV grid (built on - // DIAG_WORLD), so use pv.dim0 * pv.dim1 directly. - // kpar > 1 : hsolver's parakSolve re-splits MPI_COMM_WORLD into kpar pools - // (Parallel_K2D -> divide_mpi_groups(NPROC, kpar)), NOT the - // DIAG_WORLD grid -- each pool owns NPROC/kpar ranks. Deriving - // the pool size from pv.dim0*pv.dim1/kpar would be wrong whenever - // diago_proc < NPROC (then DIAG_WORLD != NPROC). When NPROC is - // not divisible by kpar the pools are uneven (no single size), - // so skip the purely advisory heuristic. + // Processes running one (per-pool) diagonalization: + // kpar == 1 : the grid is this ParaV grid (built on DIAG_WORLD) -> pv.dim0*pv.dim1. + // kpar > 1 : hsolver re-splits MPI_COMM_WORLD into kpar pools of NPROC/kpar ranks + // (not the DIAG_WORLD grid). Uneven pools (NPROC % kpar != 0) are skipped. int np_pool = 0; // processes per pool (0 => skip the check) if (kpar <= 1) { @@ -128,22 +110,11 @@ void init_basis_lcao(Parallel_Orbitals& pv, while (p_row > 1 && np_pool % p_row != 0) { --p_row; } const int p_col = np_pool / p_row; // long edge q (>= p_row) - // Healthy block-size window is two-sided (the nb2d-vs-time curve is U-shaped): - // nb_hi = floor(N / (2*q)) upper bound -- keep >= 2 blocks per process - // (B >= 2); larger nb -> load imbalance "cliff". - // nb_lo = min(16, nb_hi) lower bound -- blocks below ~16 lose BLAS/GEMM - // efficiency (1x1 ops) and explode block-cyclic - // communication (panel count ~ N/nb); nb=1 is the - // slow "over-scatter" end. Capped by nb_hi because a - // tiny system on many processes cannot afford large - // blocks (then balance wins and nb_lo == nb_hi). - // recommended = the largest balanced block, capped at 64 for BLAS efficiency. - // For noncollinear nspin==4 the basis carries 2-component spinors that must - // stay paired inside one ScaLAPACK block, so the block size must be a multiple - // of 2 -- this is exactly why the autoset above and the fallback use nb2d=2 (not - // 1) for nspin==4. Snap the whole window to that granularity so the recommended - // value can never break the spinor blocking (an odd nb2d segfaults the nspin==4 - // diagonalization). + // Two-sided window: nb_hi = floor(N/2q) keeps >= 2 blocks per process; + // nb_lo = min(16, nb_hi) avoids tiny blocks; recommended = min(64, nb_hi). + // nspin==4 carries 2-component spinors that must stay paired in one block + // (hence autoset/fallback use nb2d=2, not 1), so snap the window to a multiple + // of 2 -- an odd nb2d would break the spinor blocking and segfault. const int nb_unit = (PARAM.inp.nspin == 4) ? 2 : 1; auto snap = [nb_unit](int v) { v = v / nb_unit * nb_unit; return v < nb_unit ? nb_unit : v; }; const int nb_hi = snap((nlocal >= 2 * p_col) ? nlocal / (2 * p_col) : 1); @@ -161,12 +132,8 @@ void init_basis_lcao(Parallel_Orbitals& pv, issue = "too small -> over-scatter (poor BLAS efficiency and heavy communication)"; } - // Two cases, both reported via ofs_warning: - // (1) user explicitly set nb2d (PARAM.inp.nb2d != 0): respect it -- do NOT - // change the value -- but warn so the issue is visible. - // (2) auto nb2d (PARAM.inp.nb2d == 0): correct it to nb_opt. pv.nb feeds both - // the kpar==1 path and Parallel_K2D (ParaV->get_block_size()), so - // resetting it here also fixes the per-pool diagonalization. + // user-set nb2d (!=0): keep the value, only warn. auto nb2d (==0): correct it + // to nb_opt (pv.nb feeds both the kpar==1 path and the per-pool Parallel_K2D). if (issue != nullptr) { if (PARAM.inp.nb2d != 0) @@ -177,14 +144,10 @@ void init_basis_lcao(Parallel_Orbitals& pv, } else { - // Re-distribute with the recommended block size, but validate it the - // same way the initial distribution is validated above: set_nloc_wfc_Eij - // returns non-zero when nb_opt is incompatible with the band/grid layout - // (e.g. ceil(nbands/nb_opt) < process-grid width). nb_opt is derived from - // the matrix dimension and grid alone, so it can violate that band - // constraint on small systems. If it does, revert to the previously - // validated nb2d -- leaving pv half-updated (new nb/nrow but stale - // band sizes) would otherwise crash the later wavefunction setup. + // Validate nb_opt like the initial distribution: set_nloc_wfc_Eij + // returns non-zero if it is incompatible with the band/grid layout + // (ceil(nbands/nb_opt) < grid width). If so, revert to the validated + // nb_cur -- a half-updated pv would crash the later wavefunction setup. int retry = pv.set(nlocal, nlocal, nb_opt, pv.blacs_ctxt); retry += pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning); if (retry != 0) From b64b30ade686d2e3c017147f2cda49b301a170d3 Mon Sep 17 00:00:00 2001 From: liutao Date: Wed, 17 Jun 2026 17:17:29 +0800 Subject: [PATCH 7/8] revert(io): drop kpar advisory from print_info.cpp Remove the kpar recommendation added earlier; this PR now scopes to the nb2d auto-recommendation in LCAO_init_basis.cpp only. Co-Authored-By: Claude Opus 4.8 (1M context) --- source/source_io/module_output/print_info.cpp | 35 ++----------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/source/source_io/module_output/print_info.cpp b/source/source_io/module_output/print_info.cpp index 620f7bc7b10..afc49757bfe 100644 --- a/source/source_io/module_output/print_info.cpp +++ b/source/source_io/module_output/print_info.cpp @@ -96,7 +96,9 @@ void print_parameters( } else { - std::cout << std::setw(16) << kv.get_nkstot(); + const int nkstot = kv.get_nkstot(); + const int nkpoints_real = (inp.nspin == 2) ? (nkstot / 2) : nkstot; + std::cout << std::setw(16) << nkpoints_real; } std::cout << std::setw(12) << GlobalV::NPROC @@ -412,37 +414,6 @@ void print_kpar(const int &nks, const int &kpar_lcao) "%%%%%%%%%%%%\n"; } } - - // 16) recommend the optimal kpar for k-point parallelism. - // kpar splits the processes into pools, each diagonalizing a subset of the nks - // k-points independently -> near-linear speedup of the k-point loop. The ceiling is - // the number of k-points, so the optimal kpar is the largest divisor of NPROC that - // does not exceed nks (more pools than k-points leaves pools idle). Perfect balance - // additionally wants nks % kpar == 0 (see the warning above); very large systems may - // prefer fewer pools to keep enough ranks per pool for the per-pool diagonalization. - // This is advisory only -- kpar fixes the MPI pool layout and cannot be changed here. - // start the downward divisor scan at min(nks, NPROC) so we skip the d > nks - // candidates outright instead of iterating all of [1, NPROC]. - int kpar_opt = 1; - for (int d = (nks < GlobalV::NPROC) ? nks : GlobalV::NPROC; d >= 1; --d) - { - if (GlobalV::NPROC % d == 0) - { - kpar_opt = d; - break; - } - } - if (kpar_opt != kpar_lcao) - { - // Advisory only: many kpar choices are deliberate (e.g. keeping more ranks - // per pool for the per-pool diagonalization), so this is an info-level note - // rather than a WARNING. The genuinely problematic cases (nks not divisible - // by kpar, or kpar > nks leaving idle pools) are already flagged above. - GlobalV::ofs_running << " kpar advisory: current kpar = " << kpar_lcao << " (NPROC = " - << GlobalV::NPROC << ", nks = " << nks << "). Recommended kpar = " - << kpar_opt << " (largest divisor of NPROC <= nks) to parallelize" - << " the k-point loop.\n"; - } } } // namespace ModuleIO From 8fa649d2637568c3f094a02a8480baa2fefdc719 Mon Sep 17 00:00:00 2001 From: liutao Date: Wed, 17 Jun 2026 17:18:29 +0800 Subject: [PATCH 8/8] revert(io): restore print_info.cpp to baseline (drop from PR) Match the merge-base so this PR no longer touches print_info.cpp at all; develop's own later edits to this file are left intact for the merge. Co-Authored-By: Claude Opus 4.8 (1M context) --- source/source_io/module_output/print_info.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/source/source_io/module_output/print_info.cpp b/source/source_io/module_output/print_info.cpp index afc49757bfe..398cbb49a8f 100644 --- a/source/source_io/module_output/print_info.cpp +++ b/source/source_io/module_output/print_info.cpp @@ -96,9 +96,7 @@ void print_parameters( } else { - const int nkstot = kv.get_nkstot(); - const int nkpoints_real = (inp.nspin == 2) ? (nkstot / 2) : nkstot; - std::cout << std::setw(16) << nkpoints_real; + std::cout << std::setw(16) << kv.get_nkstot(); } std::cout << std::setw(12) << GlobalV::NPROC