Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
8f6d4a7
adding async for datatrees
aladinor Sep 13, 2025
77d4357
adding async method to _maybe_create_index
aladinor Sep 13, 2025
abb5f8d
using async as complete instead of gathering results
aladinor Sep 13, 2025
0c9c66c
adding tests for open_group, open_dtree and _maybe_create_index using…
aladinor Sep 14, 2025
7e174bf
ensuring _maybe_create_default_indexes_async is compatible with zarr v2
aladinor Sep 14, 2025
182c794
resolving the mypy type errors
aladinor Sep 14, 2025
db10454
attempt 2: resolving mypy type errors
aladinor Sep 14, 2025
dece3de
refactor: consolidate async index creation for DataTree opening
aladinor Dec 12, 2025
b48e8ea
perf: remove unnecessary semaphore from async datatree opening
aladinor Dec 12, 2025
e20b386
fix: add zarr v2 fallback for datatree opening
aladinor Dec 12, 2025
c2cb527
updating whats-new.rst file
aladinor Dec 13, 2025
6b5b4d3
fix: re-add semaphore to async datatree opening to prevent deadlocks
aladinor Dec 13, 2025
5d69cee
refactor: use async index creation in sync open_datatree for zarr
aladinor Jan 10, 2026
b0a1e5f
fix: add type ignore for mypy arg-type error in open_datatree_async
aladinor Jan 10, 2026
3f8f223
fix: add type annotations and fix Windows path in test
aladinor Jan 10, 2026
ff5cd7e
fix: add type annotations to nested async functions for mypy
aladinor Jan 10, 2026
ff2bd20
refactor: remove public open_datatree_async API per review feedback
aladinor Jan 14, 2026
d68d44c
refactor: convert _build_group_members to module-level helper function
aladinor Jan 16, 2026
6f78621
fix: add cast for mypy type checking in _build_group_members
aladinor Jan 16, 2026
16a5558
Update xarray/backends/api.py
aladinor Jan 16, 2026
df8b61f
refactor: use sync index creation in _maybe_create_default_indexes_async
aladinor Jan 16, 2026
05afdae
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 16, 2026
d4880b5
Update xarray/backends/api.py
aladinor Jan 16, 2026
6a4e87a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 16, 2026
89d9721
Update xarray/backends/api.py
aladinor Jan 16, 2026
980c882
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 16, 2026
27ba9f6
Address PR review: TaskGroup, max_concurrency, and open_dataset_async
aladinor Feb 3, 2026
6f4219c
Fix async deadlock risks: use zarr built-in members(), run_in_executo…
aladinor Feb 16, 2026
f5cb80c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 16, 2026
cd7bd60
Fix zarr v2 fallback in _iter_zarr_groups_async
aladinor Feb 16, 2026
15203d9
Trim verbose comments to match xarray style
aladinor Feb 16, 2026
06f3df8
Remove double index creation from async DataTree open
aladinor Feb 17, 2026
06a4a55
Use native async for DataTree open, threads only for CPU decode
aladinor Feb 18, 2026
ebf203e
Support glob patterns in open_datatree(group=...) for selective group…
aladinor Feb 22, 2026
31174a7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ New Features

- Added ``inherit='all_coords'`` option to :py:meth:`DataTree.to_dataset` to inherit
all parent coordinates, not just indexed ones (:issue:`10812`, :pull:`11230`).
- Added ``max_concurrency`` parameter to :py:func:`open_datatree` to control
the maximum number of concurrent I/O operations when opening groups in parallel
with the Zarr backend (:pull:`10742`).
By `Alfonso Ladino <https://github.com/aladinor>`_.

Breaking Changes
Expand Down Expand Up @@ -337,6 +340,9 @@ Documentation
Performance
~~~~~~~~~~~

- Improve performance of :py:func:`open_datatree` for zarr stores by using async/concurrent
loading of groups and indexes (:pull:`10742`).
By `Alfonso Ladino <https://github.com/aladinor>`_.
- Add a fastpath to the backend plugin system for standard engines (:issue:`10178`, :pull:`10937`).
By `Sam Levang <https://github.com/slevang>`_.
- Optimize :py:class:`~xarray.coding.variables.CFMaskCoder` decoder (:pull:`11105`).
Expand Down
97 changes: 92 additions & 5 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
NestedSequence,
T_Chunks,
)
from xarray.core.variable import Variable

T_NetcdfEngine = Literal["netcdf4", "scipy", "h5netcdf"]
T_Engine = Union[
Expand Down Expand Up @@ -349,7 +350,47 @@ def _datatree_from_backend_datatree(

_protect_datatree_variables_inplace(backend_tree, cache)
if create_default_indexes:
tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
_use_zarr_async = False
if engine == "zarr":
from xarray.backends.zarr import _zarr_v3

_use_zarr_async = _zarr_v3()

if _use_zarr_async:
from zarr.core.sync import sync as zarr_sync

async def create_indexes_async() -> dict[str, Dataset]:
import asyncio
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(
max_workers=10, thread_name_prefix="xarray-idx"
)
try:
results: dict[str, Dataset] = {}

async def _create_index_for_node(
path: str, ds: Dataset
) -> tuple[str, Dataset]:
return path, await _maybe_create_default_indexes_async(
ds, executor=executor
)

tasks = [
_create_index_for_node(path, node.dataset)
for path, [node] in group_subtrees(backend_tree)
]
for fut in asyncio.as_completed(tasks):
path, ds = await fut
results[path] = ds
return results
finally:
executor.shutdown(wait=True, cancel_futures=True)

results = zarr_sync(create_indexes_async())
tree = DataTree.from_dict(results, name=backend_tree.name)
else:
tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
else:
tree = backend_tree
if chunks is not None:
Expand Down Expand Up @@ -386,6 +427,33 @@ def _datatree_from_backend_datatree(
return tree


async def _maybe_create_default_indexes_async(ds: Dataset, executor=None) -> Dataset:
    """Create default indexes for unindexed dimension coordinates, loading
    the backing variables concurrently.

    Native async loading (``load_async``) is preferred; arrays that do not
    implement it fall back to a synchronous load on *executor* (or the event
    loop's default executor when *executor* is None).
    """
    import asyncio

    # Dimension coordinates (coord name == its single dim) lacking an index.
    pending = [
        name
        for name, coord in ds.coords.items()
        if coord.dims == (name,) and name not in ds.xindexes
    ]
    if not pending:
        return ds

    loop = asyncio.get_running_loop()

    async def _load(var: Variable) -> Variable:
        # Prefer the backend's native async path; fall back to a thread.
        try:
            return await var.load_async()
        except NotImplementedError:
            return await loop.run_in_executor(executor, var.load)

    await asyncio.gather(*(_load(ds.variables[name]) for name in pending))

    # assign_coords builds the default (pandas) indexes for the loaded coords.
    loaded = {name: ds.variables[name] for name in pending}
    return ds.assign_coords(Coordinates(loaded))


def open_dataset(
filename_or_obj: T_PathFileOrDataStore,
*,
Expand Down Expand Up @@ -882,6 +950,7 @@ def open_datatree(
chunked_array_type: str | None = None,
from_array_kwargs: dict[str, Any] | None = None,
backend_kwargs: dict[str, Any] | None = None,
max_concurrency: int | None = None,
**kwargs,
) -> DataTree:
"""
Expand Down Expand Up @@ -1014,15 +1083,26 @@ def open_datatree(
chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg.
For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed
to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
max_concurrency : int, optional
Maximum number of concurrent I/O operations when opening groups in
parallel. This limits the number of groups that are loaded simultaneously.
Useful for controlling resource usage with large datatrees or stores
that may have limitations on concurrent access (e.g., icechunk).
Only used by backends that support parallel loading (currently Zarr v3).
If None (default), the backend uses its default value (typically 10).
backend_kwargs: dict
Additional keyword arguments passed on to the engine open function,
equivalent to `**kwargs`.
**kwargs: dict
Additional keyword arguments passed on to the engine open function.
For example:

- 'group': path to the group in the given file to open as the root group as
a str.
- 'group': path to the group in the given file to open as the root
group as a str. If the string contains glob metacharacters
(``*``, ``?``, ``[``), it is interpreted as a pattern and only
groups whose paths match are loaded (along with their ancestors).
For example, ``group="*/sweep_0"`` loads every ``sweep_0`` one
level deep while skipping sibling groups.
- 'lock': resource lock to use when reading data from disk. Only
relevant when using dask or another form of parallelism. By default,
appropriate locks are chosen to safely read and write files with the
Expand Down Expand Up @@ -1074,6 +1154,9 @@ def open_datatree(
)
overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)

if max_concurrency is not None:
kwargs["max_concurrency"] = max_concurrency

backend_tree = backend.open_datatree(
filename_or_obj,
drop_variables=drop_variables,
Expand Down Expand Up @@ -1265,8 +1348,12 @@ def open_groups(
Additional keyword arguments passed on to the engine open function.
For example:

- 'group': path to the group in the given file to open as the root group as
a str.
- 'group': path to the group in the given file to open as the root
group as a str. If the string contains glob metacharacters
(``*``, ``?``, ``[``), it is interpreted as a pattern and only
groups whose paths match are loaded (along with their ancestors).
For example, ``group="*/sweep_0"`` loads every ``sweep_0`` one
level deep while skipping sibling groups.
- 'lock': resource lock to use when reading data from disk. Only
relevant when using dask or another form of parallelism. By default,
appropriate locks are chosen to safely read and write files with the
Expand Down
31 changes: 31 additions & 0 deletions xarray/backends/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,37 @@ def _iter_nc_groups(root, parent="/"):
yield from _iter_nc_groups(group, parent=gpath)


def _is_glob_pattern(pattern: str) -> bool:
return any(c in pattern for c in "*?[")


def _filter_group_paths(group_paths: Iterable[str], pattern: str) -> list[str]:
    """Return the subset of *group_paths* whose path matches *pattern*,
    plus the ancestors of every match (so the tree stays connected).

    Order of *group_paths* is preserved.  Matching is delegated to
    ``NodePath.match`` (PurePath semantics: a relative pattern is matched
    against the right-hand components of the path).
    """
    from xarray.core.treenode import NodePath

    # Materialize first: the annotation allows any Iterable, and we must
    # traverse the input twice (match pass + ordered output pass) — a
    # generator would be exhausted after the first loop.
    paths = list(group_paths)

    matched: set[str] = {"/"}
    for path in paths:
        np_ = NodePath(path)
        if np_.match(pattern):
            matched.add(path)
            # Keep every ancestor so matched groups remain reachable.
            for parent in np_.parents:
                p = str(parent)
                if p:
                    matched.add(p)

    return [p for p in paths if p in matched]


def _resolve_group_and_filter(
group: str | None,
all_group_paths: list[str],
) -> tuple[str | None, list[str]]:
if group is None:
return None, all_group_paths
if _is_glob_pattern(group):
return None, _filter_group_paths(all_group_paths, group)
return group, all_group_paths


def find_root_and_group(ds):
"""Find the root and group name of a netCDF4/h5netcdf dataset."""
hierarchy = ()
Expand Down
22 changes: 15 additions & 7 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,11 @@ def open_groups_as_dict(
open_kwargs: dict[str, Any] | None = None,
**kwargs,
) -> dict[str, Dataset]:
from xarray.backends.common import _iter_nc_groups
from xarray.backends.common import (
_is_glob_pattern,
_iter_nc_groups,
_resolve_group_and_filter,
)
from xarray.core.treenode import NodePath
from xarray.core.utils import close_on_error

Expand All @@ -664,10 +668,12 @@ def open_groups_as_dict(
emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims)

filename_or_obj = _normalize_filename_or_obj(filename_or_obj)

effective_group = None if (group and _is_glob_pattern(group)) else group
store = H5NetCDFStore.open(
filename_or_obj,
format=format,
group=group,
group=effective_group,
lock=lock,
invalid_netcdf=invalid_netcdf,
phony_dims=phony_dims,
Expand All @@ -678,15 +684,17 @@ def open_groups_as_dict(
open_kwargs=open_kwargs,
)

# Check for a group and make it a parent if it exists
if group:
parent = NodePath("/") / NodePath(group)
if effective_group:
parent = NodePath("/") / NodePath(effective_group)
else:
parent = NodePath("/")

manager = store._manager
all_group_paths = list(_iter_nc_groups(store.ds, parent=parent))
_, filtered_paths = _resolve_group_and_filter(group, all_group_paths)

groups_dict = {}
for path_group in _iter_nc_groups(store.ds, parent=parent):
for path_group in filtered_paths:
group_store = H5NetCDFStore(manager, group=path_group, **kwargs)
store_entrypoint = StoreBackendEntrypoint()
with close_on_error(group_store):
Expand All @@ -701,7 +709,7 @@ def open_groups_as_dict(
decode_timedelta=decode_timedelta,
)

if group:
if effective_group:
group_name = str(NodePath(path_group).relative_to(parent))
else:
group_name = str(NodePath(path_group))
Expand Down
22 changes: 15 additions & 7 deletions xarray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,13 +859,19 @@ def open_groups_as_dict(
autoclose=False,
**kwargs,
) -> dict[str, Dataset]:
from xarray.backends.common import _iter_nc_groups
from xarray.backends.common import (
_is_glob_pattern,
_iter_nc_groups,
_resolve_group_and_filter,
)
from xarray.core.treenode import NodePath

filename_or_obj = _normalize_path(filename_or_obj)

effective_group = None if (group and _is_glob_pattern(group)) else group
store = NetCDF4DataStore.open(
filename_or_obj,
group=group,
group=effective_group,
format=format,
clobber=clobber,
diskless=diskless,
Expand All @@ -875,15 +881,17 @@ def open_groups_as_dict(
autoclose=autoclose,
)

# Check for a group and make it a parent if it exists
if group:
parent = NodePath("/") / NodePath(group)
if effective_group:
parent = NodePath("/") / NodePath(effective_group)
else:
parent = NodePath("/")

manager = store._manager
all_group_paths = list(_iter_nc_groups(store.ds, parent=parent))
_, filtered_paths = _resolve_group_and_filter(group, all_group_paths)

groups_dict = {}
for path_group in _iter_nc_groups(store.ds, parent=parent):
for path_group in filtered_paths:
group_store = NetCDF4DataStore(manager, group=path_group, **kwargs)
store_entrypoint = StoreBackendEntrypoint()
with close_on_error(group_store):
Expand All @@ -897,7 +905,7 @@ def open_groups_as_dict(
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
)
if group:
if effective_group:
group_name = str(NodePath(path_group).relative_to(parent))
else:
group_name = str(NodePath(path_group))
Expand Down
33 changes: 33 additions & 0 deletions xarray/backends/store.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import asyncio
from collections.abc import Iterable
from typing import TYPE_CHECKING

Expand Down Expand Up @@ -72,5 +73,37 @@ def open_dataset(

return ds

async def open_dataset_async(
self,
filename_or_obj: T_PathFileOrDataStore,
*,
mask_and_scale=True,
decode_times=True,
concat_characters=True,
decode_coords=True,
drop_variables: str | Iterable[str] | None = None,
set_indexes: bool = True,
use_cftime=None,
decode_timedelta=None,
) -> Dataset:
"""Async version of open_dataset.

Offloads the entire open_dataset operation to a thread to avoid blocking
the event loop. This is necessary because decode_cf_variables can trigger
data reads (e.g., for time decoding) which may use synchronous I/O.
"""
return await asyncio.to_thread(
self.open_dataset,
filename_or_obj,
mask_and_scale=mask_and_scale,
decode_times=decode_times,
concat_characters=concat_characters,
decode_coords=decode_coords,
drop_variables=drop_variables,
set_indexes=set_indexes,
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
)


BACKEND_ENTRYPOINTS["store"] = (None, StoreBackendEntrypoint)
Loading
Loading