From 1c09df2777f6738c4e2413241af0a57b7fe2fdbd Mon Sep 17 00:00:00 2001
From: liutao <A-006>
Date: Wed, 17 Jun 2026 12:16:39 +0800
Subject: [PATCH 1/8] feat(lcao): auto-recommend nb2d from ScaLAPACK 2D
 block-cyclic load balance

Derive a healthy nb2d window from the per-pool process grid (q) and matrix dim N=nlocal: nb_lo=min(16,N/(2q)) <= nb2d <= nb_hi=floor(N/(2q)), recommended nb2d=min(64,nb_hi). Avoids both the large-block load-imbalance cliff and the nb=1 over-scatter end of the U-shaped time-vs-nb2d curve. Prints the recommendation in the run-info banner; energy is unaffected (performance-only).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 source/source_io/module_output/print_info.cpp | 27 ++++++
 source/source_lcao/LCAO_init_basis.cpp        | 83 +++++++++++++++++++
 2 files changed, 110 insertions(+)
diff --git a/source/source_io/module_output/print_info.cpp b/source/source_io/module_output/print_info.cpp
index 398cbb49a8f..6aff116257b 100644
--- a/source/source_io/module_output/print_info.cpp
+++ b/source/source_io/module_output/print_info.cpp
@@ -412,6 +412,33 @@ void print_kpar(const int &nks, const int &kpar_lcao)
                          "%%%%%%%%%%%%\n";
         }
     }
+
+    // 16) recommend the optimal kpar for k-point parallelism.
+    // kpar splits the processes into pools, each diagonalizing a subset of the nks
+    // k-points independently -> near-linear speedup of the k-point loop. The ceiling is
+    // the number of k-points, so the optimal kpar is the largest divisor of NPROC that
+    // does not exceed nks (more pools than k-points leaves pools idle). Perfect balance
+    // additionally wants nks % kpar == 0 (see the warning above); very large systems may
+    // prefer fewer pools to keep enough ranks per pool for the per-pool diagonalization.
+    // This is advisory only -- kpar fixes the MPI pool layout and cannot be changed here.
+    int kpar_opt = 1;
+    for (int d = GlobalV::NPROC; d >= 1; --d)
+    {
+        if (GlobalV::NPROC % d == 0 && d <= nks)
+        {
+            kpar_opt = d;
+            break;
+        }
+    }
+    if (kpar_opt != kpar_lcao)
+    {
+        GlobalV::ofs_running << " kpar advisory: current kpar = " << kpar_lcao << " (NPROC = "
+                             << GlobalV::NPROC << ", nks = " << nks << "). Recommended kpar = "
+                             << kpar_opt << " (largest divisor of NPROC <= nks) to parallelize"
+                             << " the k-point loop.\n";
+        ModuleBase::WARNING("ModuleIO::print_kpar",
+                            "kpar is not optimal; see running log for the recommended value.");
+    }
 }
 
 } // namespace ModuleIO
diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp
index c18e859599a..743ca8e80d7 100644
--- a/source/source_lcao/LCAO_init_basis.cpp
+++ b/source/source_lcao/LCAO_init_basis.cpp
@@ -3,6 +3,8 @@
 #include "source_io/module_parameter/parameter.h"
 #include "source_base/parallel_comm.h"
 
+#include <cmath>
+
 namespace LCAO_domain
 {
 
@@ -78,6 +80,87 @@ void init_basis_lcao(Parallel_Orbitals& pv,
         try_nb = pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning);
     }
 
+    // ---- nb2d (ScaLAPACK 2D block-cyclic block size) load-balance check ----
+    // ScaLAPACK diagonalizes the N x N matrix (N = nlocal) on a p x q process grid
+    // (p <= q) with square block size nb2d. With kpar k-point pools the diagonalization
+    // runs *per pool* on NPROC/kpar processes (Parallel_K2D::P2D_pool, whose block size
+    // is ParaV->get_block_size() == pv.nb), so the effective grid is the near-square
+    // factorization of NPROC/kpar -- exactly how Parallel_2D builds it. The long edge q
+    // governs load balance:
+    //   B = N / (nb2d * q)   blocks owned per process along q.
+    //   B <= 1 : one process owns a whole panel while others idle -> a catastrophic
+    //            load-imbalance "cliff" (nb2d too large; energy unaffected, time blows up).
+    // But nb2d also must not be too SMALL: blocks below ~16 lose BLAS/GEMM efficiency and
+    // explode block-cyclic communication (nb=1 = the slow "over-scatter" end). So the time-
+    // vs-nb2d curve is U-shaped and the healthy window is two-sided:
+    //     nb_lo = min(16, N/(2q))  <=  nb2d  <=  nb_hi = floor(N/(2q))
+    // recommended nb2d = min(64, nb_hi) (largest balanced block, capped for BLAS).
+    // This block-size U-curve is a property of the ScaLAPACK 2D block-cyclic *dense*
+    // diagonalization, so the check is restricted to ks_solver=scalapack_gvx. (genelpa/
+    // elpa do their own internal block tuning; lapack/cusolver/pexsi do not diagonalize
+    // on this distributed 2D grid, so the nb2d cliff does not apply to them.)
+    if (PARAM.inp.ks_solver == "scalapack_gvx")
+    {
+        const int kpar = (PARAM.globalv.kpar_lcao > 0) ? PARAM.globalv.kpar_lcao : 1;
+        const int np_total = pv.dim0 * pv.dim1;   // diagonalization world size
+        const int np_pool = (kpar > 0) ? np_total / kpar : np_total; // processes per pool
+        if (np_pool > 1 && nlocal > 0)
+        {
+            // near-square factorization np_pool = p * q, p <= q (matches Parallel_2D)
+            int p_row = static_cast<int>(std::sqrt(np_pool + 0.5));
+            while (p_row > 1 && np_pool % p_row != 0) { --p_row; }
+            const int p_col = np_pool / p_row;    // long edge q (>= p_row)
+
+            // Healthy block-size window is two-sided (the nb2d-vs-time curve is U-shaped):
+            //   nb_hi = floor(N / (2*q))  upper bound -- keep >= 2 blocks per process
+            //                             (B >= 2); larger nb -> load imbalance "cliff".
+            //   nb_lo = min(16, nb_hi)    lower bound -- blocks below ~16 lose BLAS/GEMM
+            //                             efficiency (1x1 ops) and explode block-cyclic
+            //                             communication (panel count ~ N/nb); nb=1 is the
+            //                             slow "over-scatter" end. Capped by nb_hi because a
+            //                             tiny system on many processes cannot afford large
+            //                             blocks (then balance wins and nb_lo == nb_hi).
+            // recommended = the largest balanced block, capped at 64 for BLAS efficiency.
+            const int nb_hi = (nlocal >= 2 * p_col) ? nlocal / (2 * p_col) : 1;
+            const int nb_lo = (16 < nb_hi) ? 16 : nb_hi;
+            const int nb_opt = (nb_hi < 64) ? nb_hi : 64;
+            const int nb_cur = pv.nb;
+
+            const char* issue = nullptr;
+            if (nb_cur > nb_hi)
+            {
+                issue = "too large -> ScaLAPACK load imbalance (one process owns a whole panel)";
+            }
+            else if (nb_cur < nb_lo)
+            {
+                issue = "too small -> over-scatter (poor BLAS efficiency and heavy communication)";
+            }
+
+            // Two cases, both reported via ofs_warning:
+            //   (1) user explicitly set nb2d (PARAM.inp.nb2d != 0): respect it -- do NOT
+            //       change the value -- but warn so the issue is visible.
+            //   (2) auto nb2d (PARAM.inp.nb2d == 0): correct it to nb_opt. pv.nb feeds both
+            //       the kpar==1 path and Parallel_K2D (ParaV->get_block_size()), so
+            //       resetting it here also fixes the per-pool diagonalization.
+            if (issue != nullptr)
+            {
+                if (PARAM.inp.nb2d != 0)
+                {
+                    GlobalV::ofs_warning << "init_basis_lcao: user-set nb2d=" << nb_cur << " is " << issue
+                        << " for N=" << nlocal << ", kpar=" << kpar << " (per-pool grid " << p_row << "x"
+                        << p_col << "); recommended nb2d=" << nb_opt << " (user value kept, not changed).\n";
+                }
+                else
+                {
+                    GlobalV::ofs_warning << "init_basis_lcao: auto nb2d=" << nb_cur << " is " << issue
+                        << " for N=" << nlocal << ", kpar=" << kpar << "; auto-adjusted to nb2d=" << nb_opt << ".\n";
+                    pv.set(nlocal, nlocal, nb_opt, pv.blacs_ctxt);
+                    pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning);
+                }
+            }
+        }
+    }
+
     // init blacs context for genelpa
     pv.set_desc_wfc_Eij(nlocal, PARAM.inp.nbands, pv.nrow);
 

From 92e2b1e854908e023c5ce02a5f9ddb9f353cd078 Mon Sep 17 00:00:00 2001
From: liutao <A-006>
Date: Wed, 17 Jun 2026 12:52:30 +0800
Subject: [PATCH 2/8] fix(lcao): address Copilot review on nb2d/kpar advisory

- LCAO_init_basis: skip the nb2d load-balance heuristic when NPROC is not
  divisible by kpar (pool size ill-defined) instead of using a truncated
  np_total/kpar that yields a wrong process-grid estimate.
- print_info: start the kpar divisor scan at min(nks, NPROC) so out-of-range
  candidates are skipped rather than scanning all of [1, NPROC].
- print_info: drop the ModuleBase::WARNING for a non-optimal kpar -- it is
  advisory only and the truly problematic cases (nks % kpar != 0, kpar > nks)
  are already warned about above. Keep the running-log advisory line.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 source/source_io/module_output/print_info.cpp | 12 ++++++++----
 source/source_lcao/LCAO_init_basis.cpp        |  6 +++++-
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/source/source_io/module_output/print_info.cpp b/source/source_io/module_output/print_info.cpp
index 6aff116257b..620f7bc7b10 100644
--- a/source/source_io/module_output/print_info.cpp
+++ b/source/source_io/module_output/print_info.cpp
@@ -421,10 +421,12 @@ void print_kpar(const int &nks, const int &kpar_lcao)
     // additionally wants nks % kpar == 0 (see the warning above); very large systems may
     // prefer fewer pools to keep enough ranks per pool for the per-pool diagonalization.
     // This is advisory only -- kpar fixes the MPI pool layout and cannot be changed here.
+    // start the downward divisor scan at min(nks, NPROC) so we skip the d > nks
+    // candidates outright instead of iterating all of [1, NPROC].
     int kpar_opt = 1;
-    for (int d = GlobalV::NPROC; d >= 1; --d)
+    for (int d = (nks < GlobalV::NPROC) ? nks : GlobalV::NPROC; d >= 1; --d)
     {
-        if (GlobalV::NPROC % d == 0 && d <= nks)
+        if (GlobalV::NPROC % d == 0)
         {
             kpar_opt = d;
             break;
@@ -432,12 +434,14 @@ void print_kpar(const int &nks, const int &kpar_lcao)
     }
     if (kpar_opt != kpar_lcao)
     {
+        // Advisory only: many kpar choices are deliberate (e.g. keeping more ranks
+        // per pool for the per-pool diagonalization), so this is an info-level note
+        // rather than a WARNING. The genuinely problematic cases (nks not divisible
+        // by kpar, or kpar > nks leaving idle pools) are already flagged above.
         GlobalV::ofs_running << " kpar advisory: current kpar = " << kpar_lcao << " (NPROC = "
                              << GlobalV::NPROC << ", nks = " << nks << "). Recommended kpar = "
                              << kpar_opt << " (largest divisor of NPROC <= nks) to parallelize"
                              << " the k-point loop.\n";
-        ModuleBase::WARNING("ModuleIO::print_kpar",
-                            "kpar is not optimal; see running log for the recommended value.");
     }
 }
 
diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp
index 743ca8e80d7..c64958c35ad 100644
--- a/source/source_lcao/LCAO_init_basis.cpp
+++ b/source/source_lcao/LCAO_init_basis.cpp
@@ -103,7 +103,11 @@ void init_basis_lcao(Parallel_Orbitals& pv,
     {
         const int kpar = (PARAM.globalv.kpar_lcao > 0) ? PARAM.globalv.kpar_lcao : 1;
         const int np_total = pv.dim0 * pv.dim1;   // diagonalization world size
-        const int np_pool = (kpar > 0) ? np_total / kpar : np_total; // processes per pool
+        // Pools split the world evenly; if np_total is not divisible by kpar the
+        // pool size is ill-defined, so skip the (purely advisory) heuristic rather
+        // than estimate the grid from a truncated np_total/kpar.
+        const bool pool_well_defined = (kpar > 0 && np_total % kpar == 0);
+        const int np_pool = pool_well_defined ? np_total / kpar : 0; // processes per pool
         if (np_pool > 1 && nlocal > 0)
         {
             // near-square factorization np_pool = p * q, p <= q (matches Parallel_2D)

From 81fa44a8c9860f892936a4342986a480c1b46499 Mon Sep 17 00:00:00 2001
From: liutao <A-006>
Date: Wed, 17 Jun 2026 13:30:45 +0800
Subject: [PATCH 3/8] fix(lcao): validate auto-adjusted nb2d, revert if
 band/grid-incompatible

The nb2d auto-adjust ignored the return value of set_nloc_wfc_Eij. When the
recommended nb_opt (derived from matrix dim + grid only) makes
ceil(nbands/nb_opt) smaller than the process-grid width, set_nloc_wfc_Eij
returns early without updating ncol_bands/nrow_bands/nloc_wfc, leaving pv
half-updated (new nb/nrow from pv.set, stale band sizes). This segfaulted the
gamma-only scalapack_gvx cases (02_NAO_Gamma: scf_bsse, scf_force_stress,
scf_out_hk, scf_out_hxc; N=26, nbands=6, 2x2 grid -> nb_opt=6 -> block=1<dim1).

Validate the new block size like the initial distribution and revert to the
previously validated nb2d on failure.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 source/source_lcao/LCAO_init_basis.cpp | 27 ++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp
index c64958c35ad..c8f1a005558 100644
--- a/source/source_lcao/LCAO_init_basis.cpp
+++ b/source/source_lcao/LCAO_init_basis.cpp
@@ -156,10 +156,29 @@ void init_basis_lcao(Parallel_Orbitals& pv,
                 }
                 else
                 {
-                    GlobalV::ofs_warning << "init_basis_lcao: auto nb2d=" << nb_cur << " is " << issue
-                        << " for N=" << nlocal << ", kpar=" << kpar << "; auto-adjusted to nb2d=" << nb_opt << ".\n";
-                    pv.set(nlocal, nlocal, nb_opt, pv.blacs_ctxt);
-                    pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning);
+                    // Re-distribute with the recommended block size, but validate it the
+                    // same way the initial distribution is validated above: set_nloc_wfc_Eij
+                    // returns non-zero when nb_opt is incompatible with the band/grid layout
+                    // (e.g. ceil(nbands/nb_opt) < process-grid width). nb_opt is derived from
+                    // the matrix dimension and grid alone, so it can violate that band
+                    // constraint on small systems. If it does, revert to the previously
+                    // validated nb2d -- leaving pv half-updated (new nb/nrow but stale
+                    // band sizes) would otherwise crash the later wavefunction setup.
+                    int retry = pv.set(nlocal, nlocal, nb_opt, pv.blacs_ctxt);
+                    retry += pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning);
+                    if (retry != 0)
+                    {
+                        pv.set(nlocal, nlocal, nb_cur, pv.blacs_ctxt);
+                        pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning);
+                        GlobalV::ofs_warning << "init_basis_lcao: auto nb2d=" << nb_cur << " is " << issue
+                            << " for N=" << nlocal << ", kpar=" << kpar << "; recommended nb2d=" << nb_opt
+                            << " is incompatible with the band/grid layout, so nb2d=" << nb_cur << " is kept.\n";
+                    }
+                    else
+                    {
+                        GlobalV::ofs_warning << "init_basis_lcao: auto nb2d=" << nb_cur << " is " << issue
+                            << " for N=" << nlocal << ", kpar=" << kpar << "; auto-adjusted to nb2d=" << nb_opt << ".\n";
+                    }
                 }
             }
         }

From 2ddf0de2a7dd1f33b6d1f7e80e6c26feb81646dd Mon Sep 17 00:00:00 2001
From: liutao <A-006>
Date: Wed, 17 Jun 2026 13:39:26 +0800
Subject: [PATCH 4/8] fix(lcao): derive per-pool size from the real pool
 communicator, not pv grid

The nb2d load-balance check estimated the per-pool process count as
pv.dim0 * pv.dim1 / kpar. But pv is built on DIAG_WORLD (size = diago_proc),
whereas hsolver's parakSolve re-splits MPI_COMM_WORLD into kpar pools of
NPROC/kpar ranks each (Parallel_K2D -> divide_mpi_groups(NPROC, kpar)),
independent of diago_proc. So whenever diago_proc < NPROC the old expression
gave diago_proc/kpar != NPROC/kpar -- a wrong (p_row, p_col) grid and hence a
wrong nb_hi/nb_lo window and auto-adjust decision.

Take the pool size from the actual source instead:
  - kpar == 1: the diagonalization runs on this ParaV grid -> pv.dim0 * pv.dim1.
  - kpar  > 1: NPROC / kpar (the parakSolve pool size); skip when NPROC % kpar
               != 0 since the pools are then uneven and have no single size.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 source/source_lcao/LCAO_init_basis.cpp | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp
index c8f1a005558..1261ae06d8d 100644
--- a/source/source_lcao/LCAO_init_basis.cpp
+++ b/source/source_lcao/LCAO_init_basis.cpp
@@ -102,12 +102,25 @@ void init_basis_lcao(Parallel_Orbitals& pv,
     if (PARAM.inp.ks_solver == "scalapack_gvx")
     {
         const int kpar = (PARAM.globalv.kpar_lcao > 0) ? PARAM.globalv.kpar_lcao : 1;
-        const int np_total = pv.dim0 * pv.dim1;   // diagonalization world size
-        // Pools split the world evenly; if np_total is not divisible by kpar the
-        // pool size is ill-defined, so skip the (purely advisory) heuristic rather
-        // than estimate the grid from a truncated np_total/kpar.
-        const bool pool_well_defined = (kpar > 0 && np_total % kpar == 0);
-        const int np_pool = pool_well_defined ? np_total / kpar : 0; // processes per pool
+        // Processes that actually run ONE (per-pool) diagonalization:
+        //   kpar == 1 : the full diagonalization grid IS this ParaV grid (built on
+        //               DIAG_WORLD), so use pv.dim0 * pv.dim1 directly.
+        //   kpar  > 1 : hsolver's parakSolve re-splits MPI_COMM_WORLD into kpar pools
+        //               (Parallel_K2D -> divide_mpi_groups(NPROC, kpar)), NOT the
+        //               DIAG_WORLD grid -- each pool owns NPROC/kpar ranks. Deriving
+        //               the pool size from pv.dim0*pv.dim1/kpar would be wrong whenever
+        //               diago_proc < NPROC (then DIAG_WORLD != NPROC). When NPROC is
+        //               not divisible by kpar the pools are uneven (no single size),
+        //               so skip the purely advisory heuristic.
+        int np_pool = 0; // processes per pool (0 => skip the check)
+        if (kpar <= 1)
+        {
+            np_pool = pv.dim0 * pv.dim1;
+        }
+        else if (GlobalV::NPROC % kpar == 0)
+        {
+            np_pool = GlobalV::NPROC / kpar;
+        }
         if (np_pool > 1 && nlocal > 0)
         {
             // near-square factorization np_pool = p * q, p <= q (matches Parallel_2D)

From 9d8160a786ab251bfb763529c9e6e1d638f59e1f Mon Sep 17 00:00:00 2001
From: liutao <A-006>
Date: Wed, 17 Jun 2026 14:18:51 +0800
Subject: [PATCH 5/8] fix(lcao): keep auto-recommended nb2d even for
 noncollinear nspin==4

For nspin==4 the basis carries 2-component spinors that must stay paired
within a ScaLAPACK block, so nb2d must be a multiple of 2 (the autoset and
fallback already use 2, not 1, for this reason). The nb2d recommendation
nb_opt = min(64, floor(N/2q)) could be odd (e.g. N=108, q=2 -> 27), which
broke the spinor blocking and segfaulted the nspin==4 diagonalization
(03_NAO_multik: scf_u_spin4, scf_out_dos_spin4, scf_out_mul_spin4).

Snap the whole nb2d window (nb_lo/nb_hi/nb_opt) to the nspin granularity.

Verified locally in the abacus-gnu Docker image: 02_NAO_Gamma, 03_NAO_multik
and 05_rtTDDFT integration suites all pass 100%.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 source/source_lcao/LCAO_init_basis.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp
index 1261ae06d8d..9b074fe9fc7 100644
--- a/source/source_lcao/LCAO_init_basis.cpp
+++ b/source/source_lcao/LCAO_init_basis.cpp
@@ -138,9 +138,17 @@ void init_basis_lcao(Parallel_Orbitals& pv,
             //                             tiny system on many processes cannot afford large
             //                             blocks (then balance wins and nb_lo == nb_hi).
             // recommended = the largest balanced block, capped at 64 for BLAS efficiency.
-            const int nb_hi = (nlocal >= 2 * p_col) ? nlocal / (2 * p_col) : 1;
-            const int nb_lo = (16 < nb_hi) ? 16 : nb_hi;
-            const int nb_opt = (nb_hi < 64) ? nb_hi : 64;
+            // For noncollinear nspin==4 the basis carries 2-component spinors that must
+            // stay paired inside one ScaLAPACK block, so the block size must be a multiple
+            // of 2 -- this is exactly why the autoset above and the fallback use nb2d=2 (not
+            // 1) for nspin==4. Snap the whole window to that granularity so the recommended
+            // value can never break the spinor blocking (an odd nb2d segfaults the nspin==4
+            // diagonalization).
+            const int nb_unit = (PARAM.inp.nspin == 4) ? 2 : 1;
+            auto snap = [nb_unit](int v) { v = v / nb_unit * nb_unit; return v < nb_unit ? nb_unit : v; };
+            const int nb_hi = snap((nlocal >= 2 * p_col) ? nlocal / (2 * p_col) : 1);
+            const int nb_lo = snap((16 < nb_hi) ? 16 : nb_hi);
+            const int nb_opt = snap((nb_hi < 64) ? nb_hi : 64);
             const int nb_cur = pv.nb;
 
             const char* issue = nullptr;

From a45621a2efdb2047481a01db4d24f268b7ca8459 Mon Sep 17 00:00:00 2001
From: liutao <A-006>
Date: Wed, 17 Jun 2026 17:07:39 +0800
Subject: [PATCH 6/8] docs(lcao): condense nb2d load-balance comments

Trim the verbose explanatory comments in the nb2d block to the essential
rationale; no code change.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 source/source_lcao/LCAO_init_basis.cpp | 77 +++++++-------------------
 1 file changed, 20 insertions(+), 57 deletions(-)

diff --git a/source/source_lcao/LCAO_init_basis.cpp b/source/source_lcao/LCAO_init_basis.cpp
index 9b074fe9fc7..3ed0181cb1f 100644
--- a/source/source_lcao/LCAO_init_basis.cpp
+++ b/source/source_lcao/LCAO_init_basis.cpp
@@ -82,36 +82,18 @@ void init_basis_lcao(Parallel_Orbitals& pv,
 
     // ---- nb2d (ScaLAPACK 2D block-cyclic block size) load-balance check ----
     // ScaLAPACK diagonalizes the N x N matrix (N = nlocal) on a p x q process grid
-    // (p <= q) with square block size nb2d. With kpar k-point pools the diagonalization
-    // runs *per pool* on NPROC/kpar processes (Parallel_K2D::P2D_pool, whose block size
-    // is ParaV->get_block_size() == pv.nb), so the effective grid is the near-square
-    // factorization of NPROC/kpar -- exactly how Parallel_2D builds it. The long edge q
-    // governs load balance:
-    //   B = N / (nb2d * q)   blocks owned per process along q.
-    //   B <= 1 : one process owns a whole panel while others idle -> a catastrophic
-    //            load-imbalance "cliff" (nb2d too large; energy unaffected, time blows up).
-    // But nb2d also must not be too SMALL: blocks below ~16 lose BLAS/GEMM efficiency and
-    // explode block-cyclic communication (nb=1 = the slow "over-scatter" end). So the time-
-    // vs-nb2d curve is U-shaped and the healthy window is two-sided:
-    //     nb_lo = min(16, N/(2q))  <=  nb2d  <=  nb_hi = floor(N/(2q))
-    // recommended nb2d = min(64, nb_hi) (largest balanced block, capped for BLAS).
-    // This block-size U-curve is a property of the ScaLAPACK 2D block-cyclic *dense*
-    // diagonalization, so the check is restricted to ks_solver=scalapack_gvx. (genelpa/
-    // elpa do their own internal block tuning; lapack/cusolver/pexsi do not diagonalize
-    // on this distributed 2D grid, so the nb2d cliff does not apply to them.)
+    // (p <= q) with square block nb2d. The time-vs-nb2d curve is U-shaped: too large
+    // -> load-imbalance cliff (one process owns a whole panel); too small -> poor BLAS
+    // and heavy block-cyclic communication. Healthy window [nb_lo, nb_hi] below.
+    // Only scalapack_gvx diagonalizes on this 2D grid (genelpa/elpa tune internally;
+    // lapack/cusolver/pexsi do not), so the check is restricted to it.
     if (PARAM.inp.ks_solver == "scalapack_gvx")
     {
         const int kpar = (PARAM.globalv.kpar_lcao > 0) ? PARAM.globalv.kpar_lcao : 1;
-        // Processes that actually run ONE (per-pool) diagonalization:
-        //   kpar == 1 : the full diagonalization grid IS this ParaV grid (built on
-        //               DIAG_WORLD), so use pv.dim0 * pv.dim1 directly.
-        //   kpar  > 1 : hsolver's parakSolve re-splits MPI_COMM_WORLD into kpar pools
-        //               (Parallel_K2D -> divide_mpi_groups(NPROC, kpar)), NOT the
-        //               DIAG_WORLD grid -- each pool owns NPROC/kpar ranks. Deriving
-        //               the pool size from pv.dim0*pv.dim1/kpar would be wrong whenever
-        //               diago_proc < NPROC (then DIAG_WORLD != NPROC). When NPROC is
-        //               not divisible by kpar the pools are uneven (no single size),
-        //               so skip the purely advisory heuristic.
+        // Processes running one (per-pool) diagonalization:
+        //   kpar == 1 : the grid is this ParaV grid (built on DIAG_WORLD) -> pv.dim0*pv.dim1.
+        //   kpar  > 1 : hsolver re-splits MPI_COMM_WORLD into kpar pools of NPROC/kpar ranks
+        //               (not the DIAG_WORLD grid). Uneven pools (NPROC % kpar != 0) are skipped.
         int np_pool = 0; // processes per pool (0 => skip the check)
         if (kpar <= 1)
         {
@@ -128,22 +110,11 @@ void init_basis_lcao(Parallel_Orbitals& pv,
             while (p_row > 1 && np_pool % p_row != 0) { --p_row; }
             const int p_col = np_pool / p_row;    // long edge q (>= p_row)
 
-            // Healthy block-size window is two-sided (the nb2d-vs-time curve is U-shaped):
-            //   nb_hi = floor(N / (2*q))  upper bound -- keep >= 2 blocks per process
-            //                             (B >= 2); larger nb -> load imbalance "cliff".
-            //   nb_lo = min(16, nb_hi)    lower bound -- blocks below ~16 lose BLAS/GEMM
-            //                             efficiency (1x1 ops) and explode block-cyclic
-            //                             communication (panel count ~ N/nb); nb=1 is the
-            //                             slow "over-scatter" end. Capped by nb_hi because a
-            //                             tiny system on many processes cannot afford large
-            //                             blocks (then balance wins and nb_lo == nb_hi).
-            // recommended = the largest balanced block, capped at 64 for BLAS efficiency.
-            // For noncollinear nspin==4 the basis carries 2-component spinors that must
-            // stay paired inside one ScaLAPACK block, so the block size must be a multiple
-            // of 2 -- this is exactly why the autoset above and the fallback use nb2d=2 (not
-            // 1) for nspin==4. Snap the whole window to that granularity so the recommended
-            // value can never break the spinor blocking (an odd nb2d segfaults the nspin==4
-            // diagonalization).
+            // Two-sided window: nb_hi = floor(N/2q) keeps >= 2 blocks per process;
+            // nb_lo = min(16, nb_hi) avoids tiny blocks; recommended = min(64, nb_hi).
+            // nspin==4 carries 2-component spinors that must stay paired in one block
+            // (hence autoset/fallback use nb2d=2, not 1), so snap the window to a multiple
+            // of 2 -- an odd nb2d would break the spinor blocking and segfault.
             const int nb_unit = (PARAM.inp.nspin == 4) ? 2 : 1;
             auto snap = [nb_unit](int v) { v = v / nb_unit * nb_unit; return v < nb_unit ? nb_unit : v; };
             const int nb_hi = snap((nlocal >= 2 * p_col) ? nlocal / (2 * p_col) : 1);
@@ -161,12 +132,8 @@ void init_basis_lcao(Parallel_Orbitals& pv,
                 issue = "too small -> over-scatter (poor BLAS efficiency and heavy communication)";
             }
 
-            // Two cases, both reported via ofs_warning:
-            //   (1) user explicitly set nb2d (PARAM.inp.nb2d != 0): respect it -- do NOT
-            //       change the value -- but warn so the issue is visible.
-            //   (2) auto nb2d (PARAM.inp.nb2d == 0): correct it to nb_opt. pv.nb feeds both
-            //       the kpar==1 path and Parallel_K2D (ParaV->get_block_size()), so
-            //       resetting it here also fixes the per-pool diagonalization.
+            // user-set nb2d (!=0): keep the value, only warn. auto nb2d (==0): correct it
+            // to nb_opt (pv.nb feeds both the kpar==1 path and the per-pool Parallel_K2D).
             if (issue != nullptr)
             {
                 if (PARAM.inp.nb2d != 0)
@@ -177,14 +144,10 @@ void init_basis_lcao(Parallel_Orbitals& pv,
                 }
                 else
                 {
-                    // Re-distribute with the recommended block size, but validate it the
-                    // same way the initial distribution is validated above: set_nloc_wfc_Eij
-                    // returns non-zero when nb_opt is incompatible with the band/grid layout
-                    // (e.g. ceil(nbands/nb_opt) < process-grid width). nb_opt is derived from
-                    // the matrix dimension and grid alone, so it can violate that band
-                    // constraint on small systems. If it does, revert to the previously
-                    // validated nb2d -- leaving pv half-updated (new nb/nrow but stale
-                    // band sizes) would otherwise crash the later wavefunction setup.
+                    // Validate nb_opt like the initial distribution: set_nloc_wfc_Eij
+                    // returns non-zero if it is incompatible with the band/grid layout
+                    // (ceil(nbands/nb_opt) < grid width). If so, revert to the validated
+                    // nb_cur -- a half-updated pv would crash the later wavefunction setup.
                     int retry = pv.set(nlocal, nlocal, nb_opt, pv.blacs_ctxt);
                     retry += pv.set_nloc_wfc_Eij(PARAM.inp.nbands, GlobalV::ofs_running, GlobalV::ofs_warning);
                     if (retry != 0)

From b64b30ade686d2e3c017147f2cda49b301a170d3 Mon Sep 17 00:00:00 2001
From: liutao <A-006>
Date: Wed, 17 Jun 2026 17:17:29 +0800
Subject: [PATCH 7/8] revert(io): drop kpar advisory from print_info.cpp

Remove the kpar recommendation added earlier; this PR now scopes to the
nb2d auto-recommendation in LCAO_init_basis.cpp only.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 source/source_io/module_output/print_info.cpp | 35 ++-----------------
 1 file changed, 3 insertions(+), 32 deletions(-)

diff --git a/source/source_io/module_output/print_info.cpp b/source/source_io/module_output/print_info.cpp
index 620f7bc7b10..afc49757bfe 100644
--- a/source/source_io/module_output/print_info.cpp
+++ b/source/source_io/module_output/print_info.cpp
@@ -96,7 +96,9 @@ void print_parameters(
         }
         else
         {
-            std::cout << std::setw(16) << kv.get_nkstot();
+            const int nkstot = kv.get_nkstot();
+            const int nkpoints_real = (inp.nspin == 2) ? (nkstot / 2) : nkstot;
+            std::cout << std::setw(16) << nkpoints_real;
         }
 
         std::cout << std::setw(12) << GlobalV::NPROC
@@ -412,37 +414,6 @@ void print_kpar(const int &nks, const int &kpar_lcao)
                          "%%%%%%%%%%%%\n";
         }
     }
-
-    // 16) recommend the optimal kpar for k-point parallelism.
-    // kpar splits the processes into pools, each diagonalizing a subset of the nks
-    // k-points independently -> near-linear speedup of the k-point loop. The ceiling is
-    // the number of k-points, so the optimal kpar is the largest divisor of NPROC that
-    // does not exceed nks (more pools than k-points leaves pools idle). Perfect balance
-    // additionally wants nks % kpar == 0 (see the warning above); very large systems may
-    // prefer fewer pools to keep enough ranks per pool for the per-pool diagonalization.
-    // This is advisory only -- kpar fixes the MPI pool layout and cannot be changed here.
-    // start the downward divisor scan at min(nks, NPROC) so we skip the d > nks
-    // candidates outright instead of iterating all of [1, NPROC].
-    int kpar_opt = 1;
-    for (int d = (nks < GlobalV::NPROC) ? nks : GlobalV::NPROC; d >= 1; --d)
-    {
-        if (GlobalV::NPROC % d == 0)
-        {
-            kpar_opt = d;
-            break;
-        }
-    }
-    if (kpar_opt != kpar_lcao)
-    {
-        // Advisory only: many kpar choices are deliberate (e.g. keeping more ranks
-        // per pool for the per-pool diagonalization), so this is an info-level note
-        // rather than a WARNING. The genuinely problematic cases (nks not divisible
-        // by kpar, or kpar > nks leaving idle pools) are already flagged above.
-        GlobalV::ofs_running << " kpar advisory: current kpar = " << kpar_lcao << " (NPROC = "
-                             << GlobalV::NPROC << ", nks = " << nks << "). Recommended kpar = "
-                             << kpar_opt << " (largest divisor of NPROC <= nks) to parallelize"
-                             << " the k-point loop.\n";
-    }
 }
 
 } // namespace ModuleIO

From 8fa649d2637568c3f094a02a8480baa2fefdc719 Mon Sep 17 00:00:00 2001
From: liutao <A-006>
Date: Wed, 17 Jun 2026 17:18:29 +0800
Subject: [PATCH 8/8] revert(io): restore print_info.cpp to baseline (drop from
 PR)

Match the merge-base so this PR no longer touches print_info.cpp at all;
develop's own later edits to this file are left intact for the merge.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 source/source_io/module_output/print_info.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/source/source_io/module_output/print_info.cpp b/source/source_io/module_output/print_info.cpp
index afc49757bfe..398cbb49a8f 100644
--- a/source/source_io/module_output/print_info.cpp
+++ b/source/source_io/module_output/print_info.cpp
@@ -96,9 +96,7 @@ void print_parameters(
         }
         else
         {
-            const int nkstot = kv.get_nkstot();
-            const int nkpoints_real = (inp.nspin == 2) ? (nkstot / 2) : nkstot;
-            std::cout << std::setw(16) << nkpoints_real;
+            std::cout << std::setw(16) << kv.get_nkstot();
         }
 
         std::cout << std::setw(12) << GlobalV::NPROC