diff --git a/src/annbatch/io.py b/src/annbatch/io.py index a0fd626..32548aa 100644 --- a/src/annbatch/io.py +++ b/src/annbatch/io.py @@ -540,7 +540,7 @@ def add_adatas( shuffle: bool = True, rng: np.random.Generator | None = None, ) -> Self: - """Take AnnData paths and create or add to an on-disk set of AnnData datasets with uniform var spaces at the desired path (with `dataset_size` rows per dataset if running for the first time). + """Take AnnData paths (or unique ids) and create or add to an on-disk set of AnnData datasets with uniform var spaces at the desired path (with `dataset_size` rows per dataset if running for the first time). The set of AnnData datasets is collectively referred to as a "collection" where each dataset is called `dataset_i{.h5ad}`. The main purpose of this function is to create shuffled sharded zarr datasets, which is the default behavior of this function. @@ -555,11 +555,13 @@ def add_adatas( Parameters ---------- adata_paths - Paths to the AnnData files used to create the zarr store. + Paths to/unique ids for the AnnData (files) used to create the zarr store. load_adata Function to customize (lazy-)loading the invidiual input anndata files. By default, :func:`anndata.experimental.read_lazy` is used with categoricals/nullables read into memory and `(-1)` chunks for `obs`. If you only need a subset of the input anndata files' elems (e.g., only `X` and certain `obs` columns), you can provide a custom function here to speed up loading and harmonize your data. Beware that concatenating nullables/categoricals (i.e., what happens if `len(adata_paths) > 1` internally in this function) from {class}`anndata.experimental.backed.Dataset2D` `obs` is very time consuming - consider loading these into memory if you use this argument. + Note that this function does not have to return "lazy" `AnnData` objects nor does it have to do I/O. 
+ This function can return in-memory objects (e.g., after pre-processing) and treat `adata_paths` simply as a unique identifier for the returned object. var_subset Subset of gene names to include in the store. If None, all genes are included. Genes are subset based on the `var_names` attribute of the concatenated AnnData object.