From aa050ceeb32186841b84e3ad26c8aa39e64ced85 Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Wed, 24 Sep 2025 16:50:23 +0200
Subject: [PATCH 01/13] hacks to deal with channel first dimension

---
 cellpose/contrib/distributed_segmentation.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/cellpose/contrib/distributed_segmentation.py b/cellpose/contrib/distributed_segmentation.py
index 42a21e3a..64493733 100644
--- a/cellpose/contrib/distributed_segmentation.py
+++ b/cellpose/contrib/distributed_segmentation.py
@@ -529,20 +529,27 @@ def remove_overlaps(array, crop, overlap, blocksize):
     """overlaps only there to provide context for boundary voxels
        and can be removed after segmentation is complete
        reslice array to remove the overlaps"""
-    crop_trimmed = list(crop)
+    
+    crop = list(s for s in crop if not isinstance(s, int))
+    assert len(crop) == array.ndim
+    crop_trimmed = crop
+
     for axis in range(array.ndim):
+
         if crop[axis].start != 0:
             slc = [slice(None),]*array.ndim
             slc[axis] = slice(overlap, None)
-            array = array[tuple(slc)]
+            array = array[tuple(slc)] # Why trimming array axis by axis?
             a, b = crop[axis].start, crop[axis].stop
             crop_trimmed[axis] = slice(a + overlap, b)
+
         if array.shape[axis] > blocksize[axis]:
             slc = [slice(None),]*array.ndim
             slc[axis] = slice(None, blocksize[axis])
             array = array[tuple(slc)]
             a = crop_trimmed[axis].start
             crop_trimmed[axis] = slice(a, a + blocksize[axis])
+
     return array, crop_trimmed
 
 
@@ -559,6 +566,7 @@ def bounding_boxes_in_global_coordinates(segmentation, crop):
 
 def get_nblocks(shape, blocksize):
     """Given a shape and blocksize determine the number of blocks per axis"""
+    shape = shape[len(shape) - len(blocksize):]
     return np.ceil(np.array(shape) / blocksize).astype(int)
 
 
@@ -802,14 +810,14 @@ def get_block_crops(shape, blocksize, overlap, mask):
         stop = start + blocksize + 2 * overlap
         start = np.maximum(0, start)
         stop = np.minimum(shape, stop)
-        crop = tuple(slice(x, y) for x, y in zip(start, stop))
+        crop = tuple(0 for _ in range(len(shape) - len(blocksize))) + tuple(slice(x, y) for x, y in zip(start, stop))
 
         foreground = True
         if mask is not None:
             start = mask_blocksize * index
             stop = start + mask_blocksize
             stop = np.minimum(mask.shape, stop)
-            mask_crop = tuple(slice(x, y) for x, y in zip(start, stop))
+            mask_crop = tuple(0 for _ in range(len(shape) - len(blocksize))) + tuple(slice(x, y) for x, y in zip(start, stop))
             if not np.any(mask[mask_crop]): foreground = False
         if foreground:
             indices.append(index)

From 5661db40f69572e8761c1ec3fdf1ff152e7442de Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Thu, 25 Sep 2025 15:08:58 +0200
Subject: [PATCH 02/13] feature: Add proof-of-concept SlurmCluster

---
 cellpose/contrib/distributed_segmentation.py | 120 ++++++++++++++++++-
 1 file changed, 115 insertions(+), 5 deletions(-)

diff --git a/cellpose/contrib/distributed_segmentation.py b/cellpose/contrib/distributed_segmentation.py
index 64493733..44dac907 100644
--- a/cellpose/contrib/distributed_segmentation.py
+++ b/cellpose/contrib/distributed_segmentation.py
@@ -214,6 +214,112 @@ def __exit__(self, exc_type, exc_value, traceback):
         self.client.close()
         super().__exit__(exc_type, exc_value, traceback)
 
+class SlurmCluster(dask_jobqueue.SLURMCluster):
+    """
+    This is a thin wrapper extending dask_jobqueue.SLURMCluster,
+    which in turn extends dask.distributed.SpecCluster. This wrapper
+    sets configs before the cluster or workers are initialized. This is
+    an adaptive cluster and will scale the number of workers, between user
+    specified limits, based on the number of pending tasks.
+
+    For a full list of arguments see
+    https://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html
+
+    Most users will only need to specify:
+    ncpus (the number of cpu cores per worker)
+    min_workers
+    max_workers
+    """
+
+    def __init__(
+        self,
+        ncpus,
+        min_workers,
+        max_workers,
+        config={},
+        config_name=DEFAULT_CONFIG_FILENAME,
+        persist_config=False,
+        scratch_dir = f"/scratch/{getpass.getuser()}/",
+        **kwargs
+    ):
+
+        # store all args in case needed later
+        self.locals_store = {**locals()}
+
+        # config
+        self.config_name = config_name
+        self.persist_config = persist_config
+        config_defaults = {
+            'temporary-directory':scratch_dir,
+            'distributed.comm.timeouts.connect':'180s',
+            'distributed.comm.timeouts.tcp':'360s',
+        }
+        config = {**config_defaults, **config}
+        _modify_dask_config(config, config_name)
+
+        # threading is best in low level libraries
+        job_script_prologue = [
+            f"export MKL_NUM_THREADS={2*ncpus}",
+            f"export NUM_MKL_THREADS={2*ncpus}",
+            f"export OPENBLAS_NUM_THREADS={2*ncpus}",
+            f"export OPENMP_NUM_THREADS={2*ncpus}",
+            f"export OMP_NUM_THREADS={2*ncpus}",
+        ]
+
+        # set scratch and log directories
+        if "local_directory" not in kwargs:
+            kwargs["local_directory"] = scratch_dir
+        if "log_directory" not in kwargs:
+            log_dir = f"{os.getcwd()}/dask_worker_logs_{os.getpid()}/"
+            pathlib.Path(log_dir).mkdir(parents=False, exist_ok=True)
+            kwargs["log_directory"] = log_dir
+
+        # construct
+        super().__init__(
+            processes=1,
+            cores=ncpus,
+            memory=str(15*ncpus)+'GB',
+            job_script_prologue=job_script_prologue,
+            **kwargs,
+        )
+        self.client = distributed.Client(self)
+        print("Cluster dashboard link: ", self.dashboard_link)
+
+        # set adaptive cluster bounds
+        self.adapt_cluster(min_workers, max_workers)
+
+
+    def __enter__(self): return self
+    def __exit__(self, exc_type, exc_value, traceback):
+        if not self.persist_config:
+            _remove_config_file(self.config_name)
+        self.client.close()
+        super().__exit__(exc_type, exc_value, traceback)
+
+
+    def adapt_cluster(self, min_workers, max_workers):
+        _ = self.adapt(
+            minimum_jobs=min_workers,
+            maximum_jobs=max_workers,
+            interval='10s',
+            wait_count=6,
+        )
+
+
+    def change_worker_attributes(
+        self,
+        min_workers,
+        max_workers,
+        **kwargs,
+    ):
+        """WARNING: this function is dangerous if you don't know what
+           you're doing. Don't call this unless you know exactly what
+           this does."""
+        self.scale(0)
+        for k, v in kwargs.items():
+            self.new_spec['options'][k] = v
+        self.adapt_cluster(min_workers, max_workers)
+
 
 class janeliaLSFCluster(dask_jobqueue.LSFCluster):
     """
@@ -349,9 +455,6 @@ def create_or_pass_cluster(*args, **kwargs):
         "Either cluster or cluster_kwargs must be defined"
         if not 'cluster' in kwargs:
             cluster_constructor = myLocalCluster
-            F = lambda x: x in kwargs['cluster_kwargs']
-            if F('ncpus') and F('min_workers') and F('max_workers'):
-                cluster_constructor = janeliaLSFCluster
             with cluster_constructor(**kwargs['cluster_kwargs']) as cluster:
                 kwargs['cluster'] = cluster
                 return func(*args, **kwargs)
@@ -757,6 +860,7 @@ class in this module. If you are running on the Janelia LSF cluster, see
             worker_logs_directory=str(worker_logs_dir),
         )
         results = cluster.client.gather(futures)
+        print('Scale cluster down')
         if isinstance(cluster, dask_jobqueue.core.JobQueueCluster): 
             cluster.scale(0)
 
@@ -768,27 +872,33 @@ class in this module. If you are running on the Janelia LSF cluster, see
         new_labeling_path = temporary_directory + '/new_labeling.npy'
         np.save(new_labeling_path, new_labeling)
 
+        print('Change worker attributes')
         # stitching step is cheap, we should release gpus and use small workers
         if isinstance(cluster, dask_jobqueue.core.JobQueueCluster): 
             cluster.change_worker_attributes(
                 min_workers=cluster.locals_store['min_workers'],
                 max_workers=cluster.locals_store['max_workers'],
-                ncpus=1,
+                cores=1,
                 memory="15GB",
-                mem=int(15e9),
+                #mem=int(15e9),
                 queue=None,
                 job_extra_directives=[],
             )
     
+        print('Use dask array to relabel segmentations')
         segmentation_da = dask.array.from_zarr(temp_zarr)
+        print('Map dask to blocks')
         relabeled = dask.array.map_blocks(
             lambda block: np.load(new_labeling_path)[block],
             segmentation_da,
             dtype=np.uint32,
             chunks=segmentation_da.chunks,
         )
+        print("Write output")
         dask.array.to_zarr(relabeled, write_path, overwrite=True)
+        print("Merge bboxes")
         merged_boxes = merge_all_boxes(boxes, new_labeling[box_ids])
+        print("Merge done")
         return zarr.open(write_path, mode='r'), merged_boxes
 
 

From cb9a82a8e4152de2ecb69c69e8bb49faa1b7e080 Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Mon, 29 Sep 2025 17:00:31 +0200
Subject: [PATCH 03/13] need flexibility to change ad-hoc conda environments

---
 cellpose/contrib/distributed_segmentation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cellpose/contrib/distributed_segmentation.py b/cellpose/contrib/distributed_segmentation.py
index 44dac907..3b9d646d 100644
--- a/cellpose/contrib/distributed_segmentation.py
+++ b/cellpose/contrib/distributed_segmentation.py
@@ -240,6 +240,7 @@ def __init__(
         config_name=DEFAULT_CONFIG_FILENAME,
         persist_config=False,
         scratch_dir = f"/scratch/{getpass.getuser()}/",
+        job_script_prologue = [],
         **kwargs
     ):
 
@@ -264,7 +265,7 @@ def __init__(
             f"export OPENBLAS_NUM_THREADS={2*ncpus}",
             f"export OPENMP_NUM_THREADS={2*ncpus}",
             f"export OMP_NUM_THREADS={2*ncpus}",
-        ]
+        ] + job_script_prologue
 
         # set scratch and log directories
         if "local_directory" not in kwargs:

From 37cc421a99964524f96576dc7e2b2eb7f8e96060 Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Mon, 29 Sep 2025 17:01:14 +0200
Subject: [PATCH 04/13] create ROCM-enabled conda environment

---
 environment-rocm.yml | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 environment-rocm.yml

diff --git a/environment-rocm.yml b/environment-rocm.yml
new file mode 100644
index 00000000..958c6b5c
--- /dev/null
+++ b/environment-rocm.yml
@@ -0,0 +1,31 @@
+name: cellpose
+dependencies:
+  - python==3.9.23
+  - pip
+  - pip:
+    - --index-url https://download.pytorch.org/whl/rocm6.4
+    - --extra-index-url https://pypi.org/simple
+    - qtpy
+#    - PyQt5.sip
+    - numpy>=1.20.0
+    - scipy
+    - torch>=1.6
+    - opencv-python-headless
+    - pyqtgraph>=0.11.0rc0
+    - natsort
+    - google-cloud-storage
+    - tqdm
+    - tifffile
+    - fastremap
+    - cellpose
+    - roifile
+    - pyqt5
+    - dask
+    - distributed
+    - dask-image
+    - pyyaml
+    - zarr
+    - dask_jobqueue
+    - bokeh
+    - fill-voids
+  

From 1155413bf5c4e24c2cd0a12e8764780d6e0e8932 Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Mon, 29 Sep 2025 17:06:50 +0200
Subject: [PATCH 05/13] poc: Cellpose on slurm cluster

---
 cellpose/contrib/test_slurm.py | 105 +++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 cellpose/contrib/test_slurm.py

diff --git a/cellpose/contrib/test_slurm.py b/cellpose/contrib/test_slurm.py
new file mode 100644
index 00000000..00ee153c
--- /dev/null
+++ b/cellpose/contrib/test_slurm.py
@@ -0,0 +1,105 @@
+from pathlib import Path
+
+import imageio
+import zarr
+import numpy
+from tifffile import imwrite
+from cellpose.contrib.distributed_segmentation import numpy_array_to_zarr
+
+input_tif_path = Path('volumedata.tif')
+input_zarr_path = Path('volumedata.zarr')
+output_tif_path = Path('output.tif')
+
+scratch_dir = Path.home() / 'link_scratch'
+output_zarr_path = scratch_dir / 'output.zarr'
+
+
+if not input_zarr_path.exists():
+    print('Test zarr data not found')
+    if not input_tif_path.is_file():
+        print('Download test data')
+        # Just get image data from somewhere. Note that cellpose is probably the best choice for segmenting it
+        data_numpy = imageio.imread('https://documents.epfl.ch/groups/c/cv/cvlab-unit/www/data/%20ElectronMicroscopy_Hippocampus/volumedata.tif')
+        imageio.imwrite(input_tif_path, data_numpy)
+    else:
+        print('Load test data from disk')
+        data_numpy = imageio.imread(input_tif_path)
+
+    print('Convert data to zarr')
+    data_zarr = numpy_array_to_zarr(input_zarr_path, data_numpy, chunks=(256, 256, 256))
+    del data_numpy
+else:
+    data_zarr = zarr.open(input_zarr_path)
+
+
+# Test a single block first
+from cellpose.contrib.distributed_segmentation import process_block
+
+# parameterize cellpose however you like
+model_kwargs = {'gpu':True}
+eval_kwargs = {
+    'z_axis':0,
+    'do_3D':True,
+}
+
+if small_test := False:
+    # define a crop as the distributed function would
+    starts = (128, 128, 128)
+    blocksize = (256, 256, 256)
+    overlap = 60
+    crop = tuple(slice(s-overlap, s+b+overlap) for s, b in zip(starts, blocksize))
+
+    # call the segmentation
+    segments, boxes, box_ids = process_block(
+        block_index=(0, 0, 0),  # when test_mode=True this is just a dummy value
+        crop=crop,
+        input_zarr=data_zarr,
+        model_kwargs=model_kwargs,
+        eval_kwargs=eval_kwargs,
+        blocksize=blocksize,
+        overlap=overlap,
+        output_zarr=None,
+        test_mode=True,
+    )
+
+    imwrite(output_tif_path, segments, compression ='zlib')
+
+else:
+    from cellpose.contrib.distributed_segmentation import distributed_eval, SlurmCluster
+
+    # define cluster parameters
+    cluster_kwargs = {
+        'ncpus':2,                # cpus per worker
+        'min_workers':1,          # cluster adapts number of workers based on number of blocks
+        'max_workers':16,
+        'queue': 'apu',
+        'interface': 'ib0',
+        'scratch_dir': str(scratch_dir),
+        'walltime': '1:00:00', # TODO: find realistic wall time
+        'job_extra_directives':['--constraint apu', '--gres gpu:1'],
+        'worker_extra_args': ["--lifetime", "55m", "--lifetime-stagger", "4m"],
+        'job_script_prologue' : [
+            "module load condainer",
+            f"cd {str(Path.home())}/src/cellpose/env",
+            "source activate",
+            f"cd {str(Path.home())}/src/cellpose",
+        ],
+    }
+
+    slurm_cluster = SlurmCluster(
+        **cluster_kwargs
+    )
+
+    # run segmentation
+    # outputs:
+    #     segments: zarr array containing labels
+    #     boxes: list of bounding boxes around all labels (very useful for navigating big data)
+    segments, boxes = distributed_eval(
+        input_zarr=data_zarr,
+        blocksize=(256, 256, 256),
+        write_path=str(output_zarr_path),
+        model_kwargs=model_kwargs,
+        eval_kwargs=eval_kwargs,
+        cluster = slurm_cluster,
+    )
+

From 981f86af4518fd2ede98470b1c85b7dff1d9ed6f Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Tue, 30 Sep 2025 18:24:09 +0200
Subject: [PATCH 06/13] remove scaling for stability; rename test_slurm; add
 cluster detection

---
 cellpose/contrib/cluster_script.py           |  88 ++++++++++++++++
 cellpose/contrib/distributed_segmentation.py |  45 ++++----
 cellpose/contrib/test_slurm.py               | 105 -------------------
 3 files changed, 113 insertions(+), 125 deletions(-)
 create mode 100644 cellpose/contrib/cluster_script.py
 delete mode 100644 cellpose/contrib/test_slurm.py

diff --git a/cellpose/contrib/cluster_script.py b/cellpose/contrib/cluster_script.py
new file mode 100644
index 00000000..603f8bc6
--- /dev/null
+++ b/cellpose/contrib/cluster_script.py
@@ -0,0 +1,88 @@
+from pathlib import Path
+import subprocess
+import pickle
+
+import imageio
+import zarr
+import numpy
+from tifffile import imwrite
+from cellpose.contrib.distributed_segmentation import numpy_array_to_zarr
+
+from cellpose.contrib.distributed_segmentation import distributed_eval
+from cellpose.contrib.distributed_segmentation import SlurmCluster, JaneliaLSFCluster
+
+## Parameters needs to be modified based 
+output_dir = Path.home() / 'link_scratch'
+
+# define cluster parameters
+cluster_kwargs = {
+    'job_cpu': 2
+    'ncpus':1,                  # cpus requested per worker
+    'min_workers':1,            # cluster adapts number of workers based on number of blocks
+    'max_workers':16,
+    'walltime': '1:00:00',      # TODO: find realistic wall time
+    'queue': 'GPU',             # Queue name for running (single) GPU jobs -> Ask your local HPC support
+    'interface': 'ib0',         # Interface name for compute-node communication -> 
+    'local_directory': '/tmp',  # worker local temporary directory -> Ask you local HPC support 
+    'job_extra_directives': [
+        '--gres gpu:1'
+    ],
+}
+
+input_zarr_path = output_dir / 'input.zarr'
+output_zarr_path = output_dir / 'segmentation.zarr'
+output_bbox_pkl = output_dir / 'bboxes.pkl'
+
+if not input_zarr_path.exists():
+    print('Download test data (requires internet access)')
+    crop = (slice(0,1), slice(2048,3072), slice(2048,3072), slice(0:1024))
+    data_numpy = zarr.open("https://webknossos-data.mpinb.mpg.de/data/zarr/653bd498010000ae005914a1/color/16-16-2", mode='r')[crop]
+
+    print('Save as 3D local zarr array')
+    data_zarr = numpy_array_to_zarr(input_zarr_path, data_numpy, chunks=(256, 256, 256))
+    del data_numpy
+else:
+    data_zarr = zarr.open(input_zarr_path)
+
+# parameterize cellpose however you like
+model_kwargs = {'gpu':True}
+eval_kwargs = {
+    'z_axis':0,
+    'do_3D':True,
+}
+
+# Guess cluster type by checking for cluster submission commands 
+if subprocess.getstatusoutput('sbatch -h')[0] == 0:
+    print('sbatch command detected -> use SlurmCluster')
+    cluster = SlurmCluster(**cluster_kwargs)
+elif subprocess.getstatusoutput('bsub -h')[0] == 0:
+    print('bsub command detected -> use JaneliaLSFCLuster')
+    cluster = JaneliaLSFCluster(**cluster_kwargs)
+else:
+    cluster = None
+
+if cluster is None:
+    raise Exception(
+        "Neither SLURM nor LFS cluster detected. "
+        "Currently, this script only supports SLURM or LSF cluster scheduler. "
+        "You have two options:"
+        "\n * Either use `distributed_eval` without the `cluster` but with the `cluster_kwargs` argument to start a local cluster on your machine"
+        "\n * or raise a feature request at https://github.com/MouseLand/cellpose/issues."
+    )
+
+# Start evaluation
+segments, boxes = distributed_eval(
+    input_zarr=data_zarr,
+    blocksize=(256, 256, 256),
+    write_path=str(output_zarr_path),
+    model_kwargs=model_kwargs,
+    eval_kwargs=eval_kwargs,
+    cluster = cluster,
+)
+
+# Save boxes on disk
+with open(output_bbox_pkl, 'wb') as f:
+    pickle.dump(boxes, f)
+
+print(f'Segmentation saved in {str(output_zarr_path)}')
+print(f'Object boxes saved in {str(output_bbox_pkl)}')
diff --git a/cellpose/contrib/distributed_segmentation.py b/cellpose/contrib/distributed_segmentation.py
index 3b9d646d..d514fa35 100644
--- a/cellpose/contrib/distributed_segmentation.py
+++ b/cellpose/contrib/distributed_segmentation.py
@@ -239,7 +239,7 @@ def __init__(
         config={},
         config_name=DEFAULT_CONFIG_FILENAME,
         persist_config=False,
-        scratch_dir = f"/scratch/{getpass.getuser()}/",
+        local_directory = f"/scratch/{getpass.getuser()}/",
         job_script_prologue = [],
         **kwargs
     ):
@@ -251,7 +251,7 @@ def __init__(
         self.config_name = config_name
         self.persist_config = persist_config
         config_defaults = {
-            'temporary-directory':scratch_dir,
+            'temporary-directory':local_directory,
             'distributed.comm.timeouts.connect':'180s',
             'distributed.comm.timeouts.tcp':'360s',
         }
@@ -267,9 +267,7 @@ def __init__(
             f"export OMP_NUM_THREADS={2*ncpus}",
         ] + job_script_prologue
 
-        # set scratch and log directories
-        if "local_directory" not in kwargs:
-            kwargs["local_directory"] = scratch_dir
+        # set log directories
         if "log_directory" not in kwargs:
             log_dir = f"{os.getcwd()}/dask_worker_logs_{os.getpid()}/"
             pathlib.Path(log_dir).mkdir(parents=False, exist_ok=True)
@@ -625,6 +623,7 @@ def read_preprocess_and_segment(
         log_file = f'dask_worker_{distributed.get_worker().name}.log'
         log_file = pathlib.Path(worker_logs_directory).joinpath(log_file)
     cellpose.io.logger_setup(stdout_file_replacement=log_file)
+    print(os.environ.get("HOSTNAME"))
     model = cellpose.models.CellposeModel(**model_kwargs)
     return model.eval(image, **eval_kwargs)[0].astype(np.uint32)
 
@@ -860,11 +859,15 @@ class in this module. If you are running on the Janelia LSF cluster, see
             output_zarr=temp_zarr,
             worker_logs_directory=str(worker_logs_dir),
         )
+
+        print('Gather data in host process')
         results = cluster.client.gather(futures)
-        print('Scale cluster down')
-        if isinstance(cluster, dask_jobqueue.core.JobQueueCluster): 
-            cluster.scale(0)
 
+        #print('Scale cluster down')
+        #if isinstance(cluster, dask_jobqueue.core.JobQueueCluster): 
+        #    cluster.scale(0)
+
+        print('Process results locally')
         faces, boxes_, box_ids_ = list(zip(*results))
         boxes = [box for sublist in boxes_ for box in sublist]
         box_ids = np.concatenate(box_ids_).astype(int)  # unsure how but without cast these are float64
@@ -873,18 +876,18 @@ class in this module. If you are running on the Janelia LSF cluster, see
         new_labeling_path = temporary_directory + '/new_labeling.npy'
         np.save(new_labeling_path, new_labeling)
 
-        print('Change worker attributes')
-        # stitching step is cheap, we should release gpus and use small workers
-        if isinstance(cluster, dask_jobqueue.core.JobQueueCluster): 
-            cluster.change_worker_attributes(
-                min_workers=cluster.locals_store['min_workers'],
-                max_workers=cluster.locals_store['max_workers'],
-                cores=1,
-                memory="15GB",
-                #mem=int(15e9),
-                queue=None,
-                job_extra_directives=[],
-            )
+        #print('Change worker attributes')
+        ## stitching step is cheap, we should release gpus and use small workers
+        #if isinstance(cluster, dask_jobqueue.core.JobQueueCluster): 
+        #    cluster.change_worker_attributes(
+        #        min_workers=cluster.locals_store['min_workers'],
+        #        max_workers=cluster.locals_store['max_workers'],
+        #        cores=1,
+        #        memory="15GB",
+        #        #mem=int(15e9),
+        #        queue="CPU",
+        #        job_extra_directives=[],
+        #    )
     
         print('Use dask array to relabel segmentations')
         segmentation_da = dask.array.from_zarr(temp_zarr)
@@ -897,6 +900,8 @@ class in this module. If you are running on the Janelia LSF cluster, see
         )
         print("Write output")
         dask.array.to_zarr(relabeled, write_path, overwrite=True)
+        # TODO(erjel): Scale cluster down again?
+
         print("Merge bboxes")
         merged_boxes = merge_all_boxes(boxes, new_labeling[box_ids])
         print("Merge done")
diff --git a/cellpose/contrib/test_slurm.py b/cellpose/contrib/test_slurm.py
deleted file mode 100644
index 00ee153c..00000000
--- a/cellpose/contrib/test_slurm.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from pathlib import Path
-
-import imageio
-import zarr
-import numpy
-from tifffile import imwrite
-from cellpose.contrib.distributed_segmentation import numpy_array_to_zarr
-
-input_tif_path = Path('volumedata.tif')
-input_zarr_path = Path('volumedata.zarr')
-output_tif_path = Path('output.tif')
-
-scratch_dir = Path.home() / 'link_scratch'
-output_zarr_path = scratch_dir / 'output.zarr'
-
-
-if not input_zarr_path.exists():
-    print('Test zarr data not found')
-    if not input_tif_path.is_file():
-        print('Download test data')
-        # Just get image data from somewhere. Note that cellpose is probably the best choice for segmenting it
-        data_numpy = imageio.imread('https://documents.epfl.ch/groups/c/cv/cvlab-unit/www/data/%20ElectronMicroscopy_Hippocampus/volumedata.tif')
-        imageio.imwrite(input_tif_path, data_numpy)
-    else:
-        print('Load test data from disk')
-        data_numpy = imageio.imread(input_tif_path)
-
-    print('Convert data to zarr')
-    data_zarr = numpy_array_to_zarr(input_zarr_path, data_numpy, chunks=(256, 256, 256))
-    del data_numpy
-else:
-    data_zarr = zarr.open(input_zarr_path)
-
-
-# Test a single block first
-from cellpose.contrib.distributed_segmentation import process_block
-
-# parameterize cellpose however you like
-model_kwargs = {'gpu':True}
-eval_kwargs = {
-    'z_axis':0,
-    'do_3D':True,
-}
-
-if small_test := False:
-    # define a crop as the distributed function would
-    starts = (128, 128, 128)
-    blocksize = (256, 256, 256)
-    overlap = 60
-    crop = tuple(slice(s-overlap, s+b+overlap) for s, b in zip(starts, blocksize))
-
-    # call the segmentation
-    segments, boxes, box_ids = process_block(
-        block_index=(0, 0, 0),  # when test_mode=True this is just a dummy value
-        crop=crop,
-        input_zarr=data_zarr,
-        model_kwargs=model_kwargs,
-        eval_kwargs=eval_kwargs,
-        blocksize=blocksize,
-        overlap=overlap,
-        output_zarr=None,
-        test_mode=True,
-    )
-
-    imwrite(output_tif_path, segments, compression ='zlib')
-
-else:
-    from cellpose.contrib.distributed_segmentation import distributed_eval, SlurmCluster
-
-    # define cluster parameters
-    cluster_kwargs = {
-        'ncpus':2,                # cpus per worker
-        'min_workers':1,          # cluster adapts number of workers based on number of blocks
-        'max_workers':16,
-        'queue': 'apu',
-        'interface': 'ib0',
-        'scratch_dir': str(scratch_dir),
-        'walltime': '1:00:00', # TODO: find realistic wall time
-        'job_extra_directives':['--constraint apu', '--gres gpu:1'],
-        'worker_extra_args': ["--lifetime", "55m", "--lifetime-stagger", "4m"],
-        'job_script_prologue' : [
-            "module load condainer",
-            f"cd {str(Path.home())}/src/cellpose/env",
-            "source activate",
-            f"cd {str(Path.home())}/src/cellpose",
-        ],
-    }
-
-    slurm_cluster = SlurmCluster(
-        **cluster_kwargs
-    )
-
-    # run segmentation
-    # outputs:
-    #     segments: zarr array containing labels
-    #     boxes: list of bounding boxes around all labels (very useful for navigating big data)
-    segments, boxes = distributed_eval(
-        input_zarr=data_zarr,
-        blocksize=(256, 256, 256),
-        write_path=str(output_zarr_path),
-        model_kwargs=model_kwargs,
-        eval_kwargs=eval_kwargs,
-        cluster = slurm_cluster,
-    )
-

From c88660dfb5e565315f933165fa9729b5276a177c Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Wed, 1 Oct 2025 11:12:39 +0200
Subject: [PATCH 07/13] example working end-to-end

---
 cellpose/contrib/cluster_script.py | 41 ++++++++++++++++--------------
 environment-rocm.yml               |  1 +
 2 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/cellpose/contrib/cluster_script.py b/cellpose/contrib/cluster_script.py
index 603f8bc6..7120995e 100644
--- a/cellpose/contrib/cluster_script.py
+++ b/cellpose/contrib/cluster_script.py
@@ -9,37 +9,40 @@
 from cellpose.contrib.distributed_segmentation import numpy_array_to_zarr
 
 from cellpose.contrib.distributed_segmentation import distributed_eval
-from cellpose.contrib.distributed_segmentation import SlurmCluster, JaneliaLSFCluster
+from cellpose.contrib.distributed_segmentation import SlurmCluster, janeliaLSFCluster
 
-## Parameters needs to be modified based 
+## PARAMETERS
+# Compute node accessible directory for test input zarr dataset and outputs 
 output_dir = Path.home() / 'link_scratch'
 
-# define cluster parameters
+# Cluster parameters (here: https://docs.mpcdf.mpg.de/doc/computing/viper-gpu-user-guide.html)
 cluster_kwargs = {
-    'job_cpu': 2
-    'ncpus':1,                  # cpus requested per worker
-    'min_workers':1,            # cluster adapts number of workers based on number of blocks
-    'max_workers':16,
-    'walltime': '1:00:00',      # TODO: find realistic wall time
-    'queue': 'GPU',             # Queue name for running (single) GPU jobs -> Ask your local HPC support
-    'interface': 'ib0',         # Interface name for compute-node communication -> 
-    'local_directory': '/tmp',  # worker local temporary directory -> Ask you local HPC support 
-    'job_extra_directives': [
-        '--gres gpu:1'
+    'job_cpu': 2,               # number of CPUs per GPU worker
+    'ncpus':1,                  # threads requested per GPU worker
+    'min_workers':1,            # min number of workers based on expected workload
+    'max_workers':16,           # max number of workers based on expected workload 
+    'walltime': '1:00:00',      # available runtime for each GPU worker for cluster scheduler (Slurm, LSF)
+    'queue': 'apu',             # queue/ partition name for single GPU worker *
+    'interface': 'ib0',         # interface name for compute-node communication *
+    'local_directory': '/tmp',  # compute node local temporary directory *
+    'job_extra_directives': [   # extra directives for scheduler (here: Slurm) *
+        '--constraint apu',
+        '--gres gpu:1',
     ],
 }
+# * Ask your cluster support staff for assistance
 
 input_zarr_path = output_dir / 'input.zarr'
 output_zarr_path = output_dir / 'segmentation.zarr'
 output_bbox_pkl = output_dir / 'bboxes.pkl'
 
 if not input_zarr_path.exists():
-    print('Download test data (requires internet access)')
-    crop = (slice(0,1), slice(2048,3072), slice(2048,3072), slice(0:1024))
+    print('Download (1024 x 1024 x 1024) test data')
+    crop = (slice(0,1), slice(2048,3072), slice(2048,3072), slice(0,1024))
     data_numpy = zarr.open("https://webknossos-data.mpinb.mpg.de/data/zarr/653bd498010000ae005914a1/color/16-16-2", mode='r')[crop]
 
     print('Save as 3D local zarr array')
-    data_zarr = numpy_array_to_zarr(input_zarr_path, data_numpy, chunks=(256, 256, 256))
+    data_zarr = numpy_array_to_zarr(input_zarr_path, data_numpy.squeeze(0), chunks=(256, 256, 256))
     del data_numpy
 else:
     data_zarr = zarr.open(input_zarr_path)
@@ -53,11 +56,11 @@
 
 # Guess cluster type by checking for cluster submission commands 
 if subprocess.getstatusoutput('sbatch -h')[0] == 0:
-    print('sbatch command detected -> use SlurmCluster')
+    print('Slurm sbatch command detected -> use SlurmCluster')
     cluster = SlurmCluster(**cluster_kwargs)
 elif subprocess.getstatusoutput('bsub -h')[0] == 0:
-    print('bsub command detected -> use JaneliaLSFCLuster')
-    cluster = JaneliaLSFCluster(**cluster_kwargs)
+    print('LSF bsub command detected -> use janeliaLSFCLuster')
+    cluster = janeliaLSFCluster(**cluster_kwargs)
 else:
     cluster = None
 
diff --git a/environment-rocm.yml b/environment-rocm.yml
index 958c6b5c..3e7a1b57 100644
--- a/environment-rocm.yml
+++ b/environment-rocm.yml
@@ -28,4 +28,5 @@ dependencies:
     - dask_jobqueue
     - bokeh
     - fill-voids
+    - aiohttp
   

From d8ca8997f7dbab1b41f6052d6de51bf40287bf48 Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Wed, 1 Oct 2025 11:18:00 +0200
Subject: [PATCH 08/13] rf: clean-up test crops and hostname print

---
 cellpose/contrib/distributed_segmentation.py | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/cellpose/contrib/distributed_segmentation.py b/cellpose/contrib/distributed_segmentation.py
index d514fa35..c306613b 100644
--- a/cellpose/contrib/distributed_segmentation.py
+++ b/cellpose/contrib/distributed_segmentation.py
@@ -623,7 +623,6 @@ def read_preprocess_and_segment(
         log_file = f'dask_worker_{distributed.get_worker().name}.log'
         log_file = pathlib.Path(worker_logs_directory).joinpath(log_file)
     cellpose.io.logger_setup(stdout_file_replacement=log_file)
-    print(os.environ.get("HOSTNAME"))
     model = cellpose.models.CellposeModel(**model_kwargs)
     return model.eval(image, **eval_kwargs)[0].astype(np.uint32)
 
@@ -632,27 +631,21 @@ def remove_overlaps(array, crop, overlap, blocksize):
     """overlaps only there to provide context for boundary voxels
        and can be removed after segmentation is complete
        reslice array to remove the overlaps"""
-    
-    crop = list(s for s in crop if not isinstance(s, int))
-    assert len(crop) == array.ndim
-    crop_trimmed = crop
+    crop_trimmed = list(crop)
 
     for axis in range(array.ndim):
-
         if crop[axis].start != 0:
             slc = [slice(None),]*array.ndim
             slc[axis] = slice(overlap, None)
-            array = array[tuple(slc)] # Why trimming array axis by axis?
+            array = array[tuple(slc)]
             a, b = crop[axis].start, crop[axis].stop
             crop_trimmed[axis] = slice(a + overlap, b)
-
         if array.shape[axis] > blocksize[axis]:
             slc = [slice(None),]*array.ndim
             slc[axis] = slice(None, blocksize[axis])
             array = array[tuple(slc)]
             a = crop_trimmed[axis].start
             crop_trimmed[axis] = slice(a, a + blocksize[axis])
-
     return array, crop_trimmed
 
 
@@ -669,7 +662,6 @@ def bounding_boxes_in_global_coordinates(segmentation, crop):
 
 def get_nblocks(shape, blocksize):
     """Given a shape and blocksize determine the number of blocks per axis"""
-    shape = shape[len(shape) - len(blocksize):]
     return np.ceil(np.array(shape) / blocksize).astype(int)
 
 
@@ -926,14 +918,14 @@ def get_block_crops(shape, blocksize, overlap, mask):
         stop = start + blocksize + 2 * overlap
         start = np.maximum(0, start)
         stop = np.minimum(shape, stop)
-        crop = tuple(0 for _ in range(len(shape) - len(blocksize))) + tuple(slice(x, y) for x, y in zip(start, stop))
+        crop = tuple(slice(x, y) for x, y in zip(start, stop))
 
         foreground = True
         if mask is not None:
             start = mask_blocksize * index
             stop = start + mask_blocksize
             stop = np.minimum(mask.shape, stop)
-            mask_crop = tuple(0 for _ in range(len(shape) - len(blocksize))) + tuple(slice(x, y) for x, y in zip(start, stop))
+            mask_crop = tuple(slice(x, y) for x, y in zip(start, stop))
             if not np.any(mask[mask_crop]): foreground = False
         if foreground:
             indices.append(index)

From c978b29e62fccca52e5f5246bbdd4f43249515d3 Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Tue, 18 Nov 2025 21:20:18 +0100
Subject: [PATCH 09/13] [skip ci] test run with gastruloid data

---
 cellpose/contrib/cluster_script.py | 180 ++++++++++++++++-------------
 environment-rocm.yml               |   3 +-
 environment.yml                    |   5 +-
 3 files changed, 104 insertions(+), 84 deletions(-)

diff --git a/cellpose/contrib/cluster_script.py b/cellpose/contrib/cluster_script.py
index 7120995e..37d7baa7 100644
--- a/cellpose/contrib/cluster_script.py
+++ b/cellpose/contrib/cluster_script.py
@@ -2,90 +2,108 @@
 import subprocess
 import pickle
 
-import imageio
 import zarr
-import numpy
-from tifffile import imwrite
+from tifffile import imread
+from pooch import retrieve
 from cellpose.contrib.distributed_segmentation import numpy_array_to_zarr
 
 from cellpose.contrib.distributed_segmentation import distributed_eval
-from cellpose.contrib.distributed_segmentation import SlurmCluster, janeliaLSFCluster
-
-## PARAMETERS
-# Compute node accessible directory for test input zarr dataset and outputs 
-output_dir = Path.home() / 'link_scratch'
-
-# Cluster parameters (here: https://docs.mpcdf.mpg.de/doc/computing/viper-gpu-user-guide.html)
-cluster_kwargs = {
-    'job_cpu': 2,               # number of CPUs per GPU worker
-    'ncpus':1,                  # threads requested per GPU worker
-    'min_workers':1,            # min number of workers based on expected workload
-    'max_workers':16,           # max number of workers based on expected workload 
-    'walltime': '1:00:00',      # available runtime for each GPU worker for cluster scheduler (Slurm, LSF)
-    'queue': 'apu',             # queue/ partition name for single GPU worker *
-    'interface': 'ib0',         # interface name for compute-node communication *
-    'local_directory': '/tmp',  # compute node local temporary directory *
-    'job_extra_directives': [   # extra directives for scheduler (here: Slurm) *
-        '--constraint apu',
-        '--gres gpu:1',
-    ],
-}
-# * Ask your cluster support staff for assistance
-
-input_zarr_path = output_dir / 'input.zarr'
-output_zarr_path = output_dir / 'segmentation.zarr'
-output_bbox_pkl = output_dir / 'bboxes.pkl'
-
-if not input_zarr_path.exists():
-    print('Download (1024 x 1024 x 1024) test data')
-    crop = (slice(0,1), slice(2048,3072), slice(2048,3072), slice(0,1024))
-    data_numpy = zarr.open("https://webknossos-data.mpinb.mpg.de/data/zarr/653bd498010000ae005914a1/color/16-16-2", mode='r')[crop]
-
-    print('Save as 3D local zarr array')
-    data_zarr = numpy_array_to_zarr(input_zarr_path, data_numpy.squeeze(0), chunks=(256, 256, 256))
-    del data_numpy
-else:
-    data_zarr = zarr.open(input_zarr_path)
-
-# parameterize cellpose however you like
-model_kwargs = {'gpu':True}
-eval_kwargs = {
-    'z_axis':0,
-    'do_3D':True,
-}
-
-# Guess cluster type by checking for cluster submission commands 
-if subprocess.getstatusoutput('sbatch -h')[0] == 0:
-    print('Slurm sbatch command detected -> use SlurmCluster')
-    cluster = SlurmCluster(**cluster_kwargs)
-elif subprocess.getstatusoutput('bsub -h')[0] == 0:
-    print('LSF bsub command detected -> use janeliaLSFCLuster')
-    cluster = janeliaLSFCluster(**cluster_kwargs)
-else:
-    cluster = None
-
-if cluster is None:
-    raise Exception(
-        "Neither SLURM nor LFS cluster detected. "
-        "Currently, this script only supports SLURM or LSF cluster scheduler. "
-        "You have two options:"
-        "\n * Either use `distributed_eval` without the `cluster` but with the `cluster_kwargs` argument to start a local cluster on your machine"
-        "\n * or raise a feature request at https://github.com/MouseLand/cellpose/issues."
+from cellpose.contrib.distributed_segmentation import SlurmCluster, janeliaLSFCluster, myLocalCluster
+
+
+
+def main():
+    ## PARAMETERS
+    # Compute node accessible directory for test input zarr dataset and outputs 
+    output_dir = Path.home() / 'link_scratch'
+
+    # Cluster parameters (here: https://docs.mpcdf.mpg.de/doc/computing/viper-gpu-user-guide.html)
+    cluster = {
+        'job_cpu': 2,               # number of CPUs per GPU worker
+        'ncpus':1,                  # threads requested per GPU worker
+        'min_workers':1,            # min number of workers based on expected workload
+        'max_workers':16,           # max number of workers based on expected workload 
+        'walltime': '1:00:00',      # available runtime for each GPU worker for cluster scheduler (Slurm, LSF)
+        'queue': 'apu',             # queue/ partition name for single GPU worker *
+        'interface': 'ib0',         # interface name for compute-node communication *
+        'local_directory': '/tmp',  # compute node local temporary directory *
+        'job_extra_directives': [   # extra directives for scheduler (here: Slurm) *
+            '--constraint apu',
+            '--gres gpu:1',
+        ],
+    }
+    # * Ask your cluster support staff for assistance
+
+    input_zarr_path = output_dir / 'input.zarr'
+    output_zarr_path = output_dir / 'segmentation.zarr'
+    output_bbox_pkl = output_dir / 'bboxes.pkl'
+
+
+    if not input_zarr_path.exists():
+        print('Download test data')
+        fname = retrieve(
+            url="https://zenodo.org/records/17590053/files/2d_gastruloid.tif?download=1",
+            known_hash="8ac2d944882268fbaebdfae5f7c18e4d20fdab024db2f9f02f4f45134b936872",
+            path = Path.home() / '.cellpose' / 'data',
+            progressbar=True,
+        )
+        #crop = (slice(None), slice(1024,2048), slice(1024,2048))
+        data_numpy = imread(fname)#[crop]
+
+        print('Save as 3D local zarr array')
+        data_zarr = numpy_array_to_zarr(input_zarr_path, data_numpy, chunks=(256, 256, 256))
+        del data_numpy
+    else:
+        data_zarr = zarr.open(input_zarr_path)
+
+    # parameterize cellpose however you like
+    model_kwargs = {'gpu':True}
+    eval_kwargs = {
+        'z_axis':0,
+        'do_3D':True,
+    }
+
+    # Guess cluster type by checking for cluster submission commands 
+    if subprocess.getstatusoutput('sbatch -h')[0] == 0:
+        print('Slurm sbatch command detected -> use SlurmCluster')
+        cluster = SlurmCluster(**cluster_kwargs)
+    elif subprocess.getstatusoutput('bsub -h')[0] == 0:
+        print('LSF bsub command detected -> use janeliaLSFCLuster')
+        cluster = janeliaLSFCluster(**cluster_kwargs)
+    else:
+        cluster = None
+        #cluster = myLocalCluster(**{
+        #'n_workers':1,    # if you only have 1 gpu, then 1 worker is the right choice
+        #'ncpus':24,
+        #'memory_limit':'64GB',
+        #'threads_per_worker':1,
+        #})
+
+    if cluster is None:
+        raise Exception(
+            "Neither SLURM nor LFS cluster detected. "
+            "Currently, this script only supports SLURM or LSF cluster scheduler. "
+            "You have two options:"
+            "\n * Either use `distributed_eval` without the `cluster` but with the `cluster_kwargs` argument to start a local cluster on your machine"
+            "\n * or raise a feature request at https://github.com/MouseLand/cellpose/issues."
+        )
+
+    # Start evaluation
+    segments, boxes = distributed_eval(
+        input_zarr=data_zarr,
+        blocksize=(256, 256, 256),
+        write_path=str(output_zarr_path),
+        model_kwargs=model_kwargs,
+        eval_kwargs=eval_kwargs,
+        cluster = cluster,
     )
 
-# Start evaluation
-segments, boxes = distributed_eval(
-    input_zarr=data_zarr,
-    blocksize=(256, 256, 256),
-    write_path=str(output_zarr_path),
-    model_kwargs=model_kwargs,
-    eval_kwargs=eval_kwargs,
-    cluster = cluster,
-)
-
-# Save boxes on disk
-with open(output_bbox_pkl, 'wb') as f:
-    pickle.dump(boxes, f)
-
-print(f'Segmentation saved in {str(output_zarr_path)}')
-print(f'Object boxes saved in {str(output_bbox_pkl)}')
+    # Save boxes on disk
+    with open(output_bbox_pkl, 'wb') as f:
+        pickle.dump(boxes, f)
+
+    print(f'Segmentation saved in {str(output_zarr_path)}')
+    print(f'Object boxes saved in {str(output_bbox_pkl)}')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/environment-rocm.yml b/environment-rocm.yml
index 3e7a1b57..a2c34e51 100644
--- a/environment-rocm.yml
+++ b/environment-rocm.yml
@@ -28,5 +28,4 @@ dependencies:
     - dask_jobqueue
     - bokeh
     - fill-voids
-    - aiohttp
-  
+    - pooch
diff --git a/environment.yml b/environment.yml
index 169b7641..b876c539 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,8 +1,10 @@
 name: cellpose
 dependencies:
-  - python==3.8.5
+  - python==3.9.23
   - pip
   - pip:
+    - --index-url https://download.pytorch.org/whl/cu126
+    - --extra-index-url https://pypi.org/simple
     - qtpy
 #    - PyQt5.sip
     - numpy>=1.20.0
@@ -26,4 +28,5 @@ dependencies:
     - dask_jobqueue
     - bokeh
     - fill-voids
+    - pooch
   

From 3af4df3da208eafe8a9f8db715c6c5acaa2bddb7 Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Wed, 19 Nov 2025 08:14:31 +0100
Subject: [PATCH 10/13] doc: Add comments and add additional status prints

---
 cellpose/contrib/cluster_script.py | 70 ++++++++++++++++--------------
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/cellpose/contrib/cluster_script.py b/cellpose/contrib/cluster_script.py
index 37d7baa7..667ef8bd 100644
--- a/cellpose/contrib/cluster_script.py
+++ b/cellpose/contrib/cluster_script.py
@@ -8,16 +8,17 @@
 from cellpose.contrib.distributed_segmentation import numpy_array_to_zarr
 
 from cellpose.contrib.distributed_segmentation import distributed_eval
-from cellpose.contrib.distributed_segmentation import SlurmCluster, janeliaLSFCluster, myLocalCluster
-
-
+from cellpose.contrib.distributed_segmentation import SlurmCluster, janeliaLSFCluster
 
 def main():
     ## PARAMETERS
-    # Compute node accessible directory for test input zarr dataset and outputs 
-    output_dir = Path.home() / 'link_scratch'
+    # Compute node-accessible directory for input zarr dataset and outputs 
+    output_dir = Path() / 'outputs'
+    input_zarr_path = output_dir / 'input.zarr'
+    output_zarr_path = output_dir / 'segmentation.zarr'
+    output_bbox_pkl = output_dir / 'bboxes.pkl'
 
-    # Cluster parameters (here: https://docs.mpcdf.mpg.de/doc/computing/viper-gpu-user-guide.html)
+    # Cluster parameters (example: https://docs.mpcdf.mpg.de/doc/computing/viper-gpu-user-guide.html)
     cluster = {
         'job_cpu': 2,               # number of CPUs per GPU worker
         'ncpus':1,                  # threads requested per GPU worker
@@ -34,11 +35,19 @@ def main():
     }
     # * Ask your cluster support staff for assistance
 
-    input_zarr_path = output_dir / 'input.zarr'
-    output_zarr_path = output_dir / 'segmentation.zarr'
-    output_bbox_pkl = output_dir / 'bboxes.pkl'
+    # Cellpose parameters
+    model_kwargs = {'gpu':True}
+    eval_kwargs = {
+        'z_axis':0,
+        'do_3D':True,
+    }
+
+    # Optional: Crop data to reduce runtime for this test case
+    crop = (slice(0, 221), slice(1024,2048), slice(1024,2048))
 
 
+    ## DATA PREPARATION
+    # here: DAPI-stained human gastruloid by Zhiyuan Yu (https://zenodo.org/records/17590053)
     if not input_zarr_path.exists():
         print('Download test data')
         fname = retrieve(
@@ -46,23 +55,18 @@ def main():
             known_hash="8ac2d944882268fbaebdfae5f7c18e4d20fdab024db2f9f02f4f45134b936872",
             path = Path.home() / '.cellpose' / 'data',
             progressbar=True,
-        )
-        #crop = (slice(None), slice(1024,2048), slice(1024,2048))
-        data_numpy = imread(fname)#[crop]
+        )       
+        data_numpy = imread(fname)[crop]
 
-        print('Save as 3D local zarr array')
+        print(f'Convert to {data_numpy.shape} zarr array')
         data_zarr = numpy_array_to_zarr(input_zarr_path, data_numpy, chunks=(256, 256, 256))
+        print(f'Input stored in {input_zarr_path}')
         del data_numpy
     else:
+        print(f'Read input data from {input_zarr_path}')
         data_zarr = zarr.open(input_zarr_path)
 
-    # parameterize cellpose however you like
-    model_kwargs = {'gpu':True}
-    eval_kwargs = {
-        'z_axis':0,
-        'do_3D':True,
-    }
-
+    ## EVALUATION
     # Guess cluster type by checking for cluster submission commands 
     if subprocess.getstatusoutput('sbatch -h')[0] == 0:
         print('Slurm sbatch command detected -> use SlurmCluster')
@@ -72,11 +76,13 @@ def main():
         cluster = janeliaLSFCluster(**cluster_kwargs)
     else:
         cluster = None
+        ## Note in case you want to test without a cluster scheduler use:
+        #from cellpose.contrib.distributed_segmentation import myLocalCluster
         #cluster = myLocalCluster(**{
-        #'n_workers':1,    # if you only have 1 gpu, then 1 worker is the right choice
-        #'ncpus':24,
-        #'memory_limit':'64GB',
-        #'threads_per_worker':1,
+        #    'n_workers': 1,            # if you only have 1 gpu, then 1 worker is the right choice
+        #    'ncpus': 8,                
+        #    'memory_limit':'64GB',
+        #    'threads_per_worker':1,
         #})
 
     if cluster is None:
@@ -88,22 +94,22 @@ def main():
             "\n * or raise a feature request at https://github.com/MouseLand/cellpose/issues."
         )
 
-    # Start evaluation
+    # Start computation
     segments, boxes = distributed_eval(
-        input_zarr=data_zarr,
-        blocksize=(256, 256, 256),
-        write_path=str(output_zarr_path),
-        model_kwargs=model_kwargs,
-        eval_kwargs=eval_kwargs,
+        input_zarr = data_zarr,
+        blocksize = (256, 256, 256),
+        write_path = str(output_zarr_path),
+        model_kwargs = model_kwargs,
+        eval_kwargs = eval_kwargs,
         cluster = cluster,
     )
 
-    # Save boxes on disk
+    # Save bounding boxes on disk
     with open(output_bbox_pkl, 'wb') as f:
         pickle.dump(boxes, f)
 
     print(f'Segmentation saved in {str(output_zarr_path)}')
-    print(f'Object boxes saved in {str(output_bbox_pkl)}')
+    print(f'Object bounding boxes saved in {str(output_bbox_pkl)}')
 
 if __name__ == '__main__':
     main()
\ No newline at end of file

From d1d95354a9313c88c82f9cb15a2a6f23aeba65d6 Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Wed, 19 Nov 2025 12:08:44 +0100
Subject: [PATCH 11/13] [skip ci] bugfix: wrong variable name

---
 cellpose/contrib/cluster_script.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cellpose/contrib/cluster_script.py b/cellpose/contrib/cluster_script.py
index 667ef8bd..5fac515a 100644
--- a/cellpose/contrib/cluster_script.py
+++ b/cellpose/contrib/cluster_script.py
@@ -19,7 +19,7 @@ def main():
     output_bbox_pkl = output_dir / 'bboxes.pkl'
 
     # Cluster parameters (example: https://docs.mpcdf.mpg.de/doc/computing/viper-gpu-user-guide.html)
-    cluster = {
+    cluster_kwargs = {
         'job_cpu': 2,               # number of CPUs per GPU worker
         'ncpus':1,                  # threads requested per GPU worker
         'min_workers':1,            # min number of workers based on expected workload

From 593fa9b8fc238ea88d11e9af9abde862b59f5bda Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Wed, 19 Nov 2025 12:09:41 +0100
Subject: [PATCH 12/13] [skip ci] deps: Different index URL only relevant on
 Windows

---
 environment.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/environment.yml b/environment.yml
index b876c539..8302dd9b 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,10 +1,11 @@
 name: cellpose
 dependencies:
   - python==3.9.23
+  - pyarrow
   - pip
   - pip:
-    - --index-url https://download.pytorch.org/whl/cu126
-    - --extra-index-url https://pypi.org/simple
+    - --index-url https://download.pytorch.org/whl/cu128 # [win] # Only for Windows: Use non-PyPi torch wheels for CUDA support
+    - --extra-index-url https://pypi.org/simple # [win] # Only for Windows: Use PyPi for everything else
     - qtpy
 #    - PyQt5.sip
     - numpy>=1.20.0

From 59f9e00aa4d3bd7c25bb9d100748dcf4af90637e Mon Sep 17 00:00:00 2001
From: Eric Jelli <eric.jelli@mpinb.mpg.de>
Date: Wed, 19 Nov 2025 13:28:12 +0100
Subject: [PATCH 13/13] [skip ci] do not install pyarrow from conda

---
 environment.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 8302dd9b..da3c13f5 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,7 +1,6 @@
 name: cellpose
 dependencies:
   - python==3.9.23
-  - pyarrow
   - pip
   - pip:
     - --index-url https://download.pytorch.org/whl/cu128 # [win] # Only for Windows: Use non-PyPi torch wheels for CUDA support