From fd81c7b9e1d55063eadead10dde1ae32aa4a12cc Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Sun, 29 Mar 2026 11:21:01 +0200 Subject: [PATCH 1/2] further clarify --- src/annbatch/io.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index a0fd6265..5e1dda78 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -540,7 +540,7 @@ def add_adatas( shuffle: bool = True, rng: np.random.Generator | None = None, ) -> Self: - """Take AnnData paths and create or add to an on-disk set of AnnData datasets with uniform var spaces at the desired path (with `dataset_size` rows per dataset if running for the first time). + """Take AnnData paths (or unique ids) and create or add to an on-disk set of AnnData datasets with uniform var spaces at the desired path (with `dataset_size` rows per dataset if running for the first time). The set of AnnData datasets is collectively referred to as a "collection" where each dataset is called `dataset_i{.h5ad}`. The main purpose of this function is to create shuffled sharded zarr datasets, which is the default behavior of this function. @@ -555,11 +555,13 @@ def add_adatas( Parameters ---------- adata_paths - Paths to the AnnData files used to create the zarr store. + Paths to/unique ids for the AnnData (files) used to create the zarr store. load_adata Function to customize (lazy-)loading the invidiual input anndata files. By default, :func:`anndata.experimental.read_lazy` is used with categoricals/nullables read into memory and `(-1)` chunks for `obs`. If you only need a subset of the input anndata files' elems (e.g., only `X` and certain `obs` columns), you can provide a custom function here to speed up loading and harmonize your data. 
Beware that concatenating nullables/categoricals (i.e., what happens if `len(adata_paths) > 1` internally in this function) from {class}`anndata.experimental.backed.Dataset2D` `obs` is very time consuming - consider loading these into memory if you use this argument. + Note that this function does not have to return "lazy" `AnnData` objects nor does it have to do I/O. + This function can return in-memory objects after pre-processing and treat `adata_paths` as simply a unique identifier for identifying the returned object. var_subset Subset of gene names to include in the store. If None, all genes are included. Genes are subset based on the `var_names` attribute of the concatenated AnnData object. From 55f23899a622f0b38a888fde69c77de66a725525 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Sun, 29 Mar 2026 11:22:03 +0200 Subject: [PATCH 2/2] i.e., --- src/annbatch/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/annbatch/io.py b/src/annbatch/io.py index 5e1dda78..32548aa4 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -561,7 +561,7 @@ def add_adatas( If you only need a subset of the input anndata files' elems (e.g., only `X` and certain `obs` columns), you can provide a custom function here to speed up loading and harmonize your data. Beware that concatenating nullables/categoricals (i.e., what happens if `len(adata_paths) > 1` internally in this function) from {class}`anndata.experimental.backed.Dataset2D` `obs` is very time consuming - consider loading these into memory if you use this argument. Note that this function does not have to return "lazy" `AnnData` objects nor does it have to do I/O. - This function can return in-memory objects after pre-processing and treat `adata_paths` as simply a unique identifier for identifying the returned object. + This function can return in-memory objects (i.e., after pre-processing) and treat `adata_paths` as simply a unique identifier for identifying the returned object. 
var_subset Subset of gene names to include in the store. If None, all genes are included. Genes are subset based on the `var_names` attribute of the concatenated AnnData object.