From a5929dc201b8dcac5ee413ab9cfdac7f5ae243be Mon Sep 17 00:00:00 2001
From: Phyzait Lu <i@phyzait.moe>
Date: Tue, 16 Jun 2026 01:47:57 +0800
Subject: [PATCH] bugfix: reuse explicit GPU IDs cyclically across MPI ranks

---
 src/align_tiltseries_runner.cpp |  8 +++-----
 src/align_tiltseries_runner.h   |  1 +
 src/amyloid_finder.cpp          |  2 +-
 src/amyloid_finder_mpi.cpp      |  4 ++--
 src/args.cpp                    | 28 ++++++++++++++++++++++++++++
 src/args.h                      | 10 ++++++++++
 src/autopicker.cpp              |  2 +-
 src/autopicker_mpi.cpp          |  4 ++--
 src/ml_optimiser.cpp            |  4 ++--
 src/ml_optimiser_mpi.cpp        |  6 +++---
 src/motioncorr_runner.cpp       |  8 +++-----
 src/motioncorr_runner.h         |  1 +
 12 files changed, 57 insertions(+), 21 deletions(-)
diff --git a/src/align_tiltseries_runner.cpp b/src/align_tiltseries_runner.cpp
index 93b172bc9..0a2b537e7 100644
--- a/src/align_tiltseries_runner.cpp
+++ b/src/align_tiltseries_runner.cpp
@@ -563,13 +563,11 @@ void AlignTiltseriesRunner::executeAreTomo(long idx_tomo, int rank)
 
     if (gpu_ids.length() > 0)
     {
-        if (rank >= allThreadIDs.size())
-            REPORT_ERROR("ERROR: not enough MPI nodes specified for the GPU IDs.");
-
         command += " -Gpu " ;
-        for (int igpu = 0; igpu < allThreadIDs[rank].size(); igpu++)
+        const std::vector<std::string> &rankThreadIDs = getDeviceIDsForRank(allThreadIDs, rank);
+        for (int igpu = 0; igpu < rankThreadIDs.size(); igpu++)
         {
-            command += allThreadIDs[rank][igpu] + " ";
+            command += rankThreadIDs[igpu] + " ";
         }
     }
 
diff --git a/src/align_tiltseries_runner.h b/src/align_tiltseries_runner.h
index bc926e844..9cc4013fd 100644
--- a/src/align_tiltseries_runner.h
+++ b/src/align_tiltseries_runner.h
@@ -27,6 +27,7 @@
 #include  <stdlib.h>
 #include  <unistd.h>
 #include  <stdio.h>
+#include "src/args.h"
 #include <src/metadata_table.h>
 #include <src/image.h>
 #include <src/time.h>
diff --git a/src/amyloid_finder.cpp b/src/amyloid_finder.cpp
index 1f1b238ba..fa68b0ce4 100644
--- a/src/amyloid_finder.cpp
+++ b/src/amyloid_finder.cpp
@@ -244,7 +244,7 @@ void AmyloidFinder::deviceInitialise()
 	untangleDeviceIDs(gpu_ids, allThreadIDs);
 
 	// Sequential initialisation of GPUs on all ranks
-	if (!std::isdigit(*gpu_ids.begin()))
+	if (!hasExplicitDeviceIDs(gpu_ids))
 		device_id = 0;
 	else
 		device_id = textToInteger((allThreadIDs[0][0]).c_str());
diff --git a/src/amyloid_finder_mpi.cpp b/src/amyloid_finder_mpi.cpp
index 18bb14a54..459e37808 100644
--- a/src/amyloid_finder_mpi.cpp
+++ b/src/amyloid_finder_mpi.cpp
@@ -53,10 +53,10 @@ void AmyloidFinderMpi::deviceInitialise()
 	untangleDeviceIDs(gpu_ids, allThreadIDs);
 
 	// Sequential initialisation of GPUs on all ranks
-	if (!std::isdigit(*gpu_ids.begin()))
+	if (!hasExplicitDeviceIDs(gpu_ids))
 		device_id = node->rank%devCount;
 	else
-		device_id = textToInteger((allThreadIDs[node->rank][0]).c_str());
+		device_id = textToInteger(getDeviceIDsForRank(allThreadIDs, node->rank)[0].c_str());
 
 	for (int follower = 0; follower < node->size; follower++)
 	{
diff --git a/src/args.cpp b/src/args.cpp
index c77a472e1..76e82d6c5 100644
--- a/src/args.cpp
+++ b/src/args.cpp
@@ -46,6 +46,7 @@
 #include "src/gcc_version.h"
 #include "src/matrix1d.h"
 #include <algorithm>
+#include <cctype>
 
 // Get parameters from the command line ====================================
 std::string getParameter(int argc, char **argv, const std::string param, const std::string option)
@@ -472,3 +473,30 @@ void untangleDeviceIDs(std::string &tangled, std::vector < std::vector < std::st
 #endif
 }
 
+bool hasExplicitDeviceIDs(const std::string &gpu_ids)
+{
+	return gpu_ids.size() > 0 && std::isdigit(static_cast<unsigned char>(gpu_ids[0]));
+}
+
+const std::vector<std::string>& getDeviceIDsForRank(
+	const std::vector < std::vector < std::string > > &allThreadIDs,
+	int rank)
+{
+	if (rank < 0)
+		REPORT_ERROR("Negative MPI rank requested for GPU ID assignment.");
+
+	if (allThreadIDs.size() == 0)
+		REPORT_ERROR("No GPU IDs parsed from --gpu.");
+
+	const std::vector<std::string> &rankThreadIDs = allThreadIDs[rank % allThreadIDs.size()];
+	if (rankThreadIDs.size() == 0)
+		REPORT_ERROR("No GPU IDs parsed for MPI rank from --gpu.");
+
+	for (int i = 0; i < rankThreadIDs.size(); i++)
+	{
+		if (rankThreadIDs[i] == "")
+			REPORT_ERROR("Empty GPU ID parsed from --gpu.");
+	}
+
+	return rankThreadIDs;
+}
diff --git a/src/args.h b/src/args.h
index 224294dd9..1c7f30182 100644
--- a/src/args.h
+++ b/src/args.h
@@ -218,4 +218,14 @@ class IOParser
  */
 void untangleDeviceIDs(std::string &tangled, std::vector < std::vector < std::string > > &untangled);
 
+bool hasExplicitDeviceIDs(const std::string &gpu_ids);
+
+/*
+ * Returns the device IDs for an MPI rank. If fewer rank-specific GPU ID groups
+ * are provided than MPI ranks, the groups are reused cyclically.
+ */
+const std::vector<std::string>& getDeviceIDsForRank(
+	const std::vector < std::vector < std::string > > &allThreadIDs,
+	int rank);
+
 #endif
diff --git a/src/autopicker.cpp b/src/autopicker.cpp
index f1c2fe467..b815f0e6a 100644
--- a/src/autopicker.cpp
+++ b/src/autopicker.cpp
@@ -967,7 +967,7 @@ void AutoPicker::deviceInitialise()
 	untangleDeviceIDs(gpu_ids, allThreadIDs);
 
 	// Sequential initialisation of GPUs on all ranks
-	if (!std::isdigit(*gpu_ids.begin()))
+	if (!hasExplicitDeviceIDs(gpu_ids))
 		device_id = 0;
 	else
 		device_id = textToInteger((allThreadIDs[0][0]).c_str());
diff --git a/src/autopicker_mpi.cpp b/src/autopicker_mpi.cpp
index 20d009b27..3db6cbb7b 100644
--- a/src/autopicker_mpi.cpp
+++ b/src/autopicker_mpi.cpp
@@ -54,10 +54,10 @@ void AutoPickerMpi::deviceInitialise()
 	untangleDeviceIDs(gpu_ids, allThreadIDs);
 
 	// Sequential initialisation of GPUs on all ranks
-	if (!std::isdigit(*gpu_ids.begin()))
+	if (!hasExplicitDeviceIDs(gpu_ids))
 		device_id = node->rank%devCount;
 	else
-		device_id = textToInteger((allThreadIDs[node->rank][0]).c_str());
+		device_id = textToInteger(getDeviceIDsForRank(allThreadIDs, node->rank)[0].c_str());
 
 	for (int follower = 0; follower < node->size; follower++)
 	{
diff --git a/src/ml_optimiser.cpp b/src/ml_optimiser.cpp
index 1179c6374..816f391dc 100644
--- a/src/ml_optimiser.cpp
+++ b/src/ml_optimiser.cpp
@@ -1608,7 +1608,7 @@ void MlOptimiser::initialise()
         // Sequential initialisation of GPUs on all ranks
         bool fullAutomaticMapping(true);
         bool semiAutomaticMapping(true);
-        if (allThreadIDs[0].size()==0 || (!std::isdigit(*gpu_ids.begin())) )
+        if (allThreadIDs[0].size()==0 || (!hasExplicitDeviceIDs(gpu_ids)) )
             std::cout << "gpu-ids not specified, threads will automatically be mapped to devices (incrementally)."<< std::endl;
         else
         {
@@ -1714,7 +1714,7 @@ void MlOptimiser::initialise()
 
 		bool fullAutomaticMapping;
 		bool semiAutomaticMapping;
-		if (allThreadIDs[0].size()==0 || ! std::isdigit(*gpu_ids.begin()) )
+		if (allThreadIDs[0].size()==0 || !hasExplicitDeviceIDs(gpu_ids) )
 		{
 			fullAutomaticMapping = true;
 			semiAutomaticMapping = true;
diff --git a/src/ml_optimiser_mpi.cpp b/src/ml_optimiser_mpi.cpp
index e84666a81..397264518 100644
--- a/src/ml_optimiser_mpi.cpp
+++ b/src/ml_optimiser_mpi.cpp
@@ -241,7 +241,7 @@ void MlOptimiserMpi::initialise()
 				fullAutomaticMapping = true;
 				semiAutomaticMapping = true; // possible to set fully manual for specific ranks
 
-				if ((allThreadIDs.size()<rank) || allThreadIDs[0].size()==0 || (!std::isdigit(*gpu_ids.begin())) )
+				if ((allThreadIDs.size()<rank) || allThreadIDs[0].size()==0 || (!hasExplicitDeviceIDs(gpu_ids)) )
 				{
 					std::cout << "GPU-ids not specified for this rank, threads will automatically be mapped to available devices."<< std::endl;
 				}
@@ -485,7 +485,7 @@ will still yield good performance and possibly a more stable execution. \n" << s
 	{
 		if (node->isLeader())
 		{
-			if (! std::isdigit(*gpu_ids.begin()))
+			if (!hasExplicitDeviceIDs(gpu_ids))
 			{
 				std::cout << std::string(80, '*') << std::endl;
 				std::cout << "GPU-ids not specified and MPI/threads will automatically be mapped to available devices."<< std::endl;
@@ -626,7 +626,7 @@ will still yield good performance and possibly a more stable execution. \n" << s
 
 			bool fullAutomaticMapping;
 			bool semiAutomaticMapping;
-			if (allThreadIDs[node->rank-1].size()==0 || ! std::isdigit(*gpu_ids.begin()) )
+			if (allThreadIDs[node->rank-1].size()==0 || !hasExplicitDeviceIDs(gpu_ids) )
 			{
 				fullAutomaticMapping = true;
 				semiAutomaticMapping = true;
diff --git a/src/motioncorr_runner.cpp b/src/motioncorr_runner.cpp
index fa79070a0..9ee59c5ec 100644
--- a/src/motioncorr_runner.cpp
+++ b/src/motioncorr_runner.cpp
@@ -636,12 +636,10 @@ bool MotioncorrRunner::executeMotioncor2(Micrograph &mic, int rank)
 	}
 	else
 	{
-		if (rank >= allThreadIDs.size())
-			REPORT_ERROR("ERROR: not enough MPI nodes specified for the GPU IDs.");
-
 		command += " -Gpu ";
-		for (int igpu = 0; igpu < allThreadIDs[rank].size(); igpu++)
-			command += allThreadIDs[rank][igpu] + " ";
+		const std::vector<std::string> &rankThreadIDs = getDeviceIDsForRank(allThreadIDs, rank);
+		for (int igpu = 0; igpu < rankThreadIDs.size(); igpu++)
+			command += rankThreadIDs[igpu] + " ";
 	}
 
 	command += " >> " + fn_out + " 2>> " + fn_err;
diff --git a/src/motioncorr_runner.h b/src/motioncorr_runner.h
index c9661fc24..e4ad28f4d 100644
--- a/src/motioncorr_runner.h
+++ b/src/motioncorr_runner.h
@@ -27,6 +27,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <algorithm>
+#include "src/args.h"
 #include <src/time.h>
 #include "src/metadata_table.h"
 #include "src/image.h"