Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/3987.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Two new runtime config options control how the sharding codec coalesces partial-shard reads: `sharding.read.coalesce_max_gap_bytes` (default 1 MiB) and `sharding.read.coalesce_max_bytes` (default 16 MiB). When reading multiple chunks from the same shard, nearby byte ranges are merged into a single request to the store if the gap between them is no more than `coalesce_max_gap_bytes` and the merged read stays within `coalesce_max_bytes`.
1 change: 1 addition & 0 deletions docs/user-guide/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Configuration options include the following:
- Async and threading options, e.g. `async.concurrency` and `threading.max_workers`
- Selections of implementations of codecs, codec pipelines and buffers
- Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more.
- Controlling request merging when reading multiple chunks from the same shard with `sharding.read.coalesce_max_gap_bytes` and `sharding.read.coalesce_max_bytes`. Reads of nearby chunks are coalesced into a single request to the store when the gap between them is at most `coalesce_max_gap_bytes` and the resulting merged read is no larger than `coalesce_max_bytes`.

For selecting custom implementations of codecs, pipelines, buffers and ndbuffers,
first register the implementations in the registry and then select them in the config.
Expand Down
15 changes: 14 additions & 1 deletion src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
parse_shapelike,
product,
)
from zarr.core.config import config
from zarr.core.dtype.npy.int import UInt64
from zarr.core.indexing import (
BasicIndexer,
Expand Down Expand Up @@ -483,6 +484,8 @@ async def _decode_partial_single(
chunk_spec.prototype,
chunks_per_shard,
all_chunk_coords,
max_gap_bytes=config.get("sharding.read.coalesce_max_gap_bytes"),
max_coalesced_bytes=config.get("sharding.read.coalesce_max_bytes"),
)

if shard_dict_maybe is None:
Expand Down Expand Up @@ -780,10 +783,16 @@ async def _load_partial_shard_maybe(
prototype: BufferPrototype,
chunks_per_shard: tuple[int, ...],
all_chunk_coords: set[tuple[int, ...]],
max_gap_bytes: int,
max_coalesced_bytes: int,
) -> ShardMapping | None:
"""
Read chunks from `byte_getter` for the case where the read is less than a full shard.
Returns a mapping of chunk coordinates to bytes or None.
`max_gap_bytes` and `max_coalesced_bytes` are forwarded to
`Store.get_ranges` to control byte-range coalescing across the requested
chunks.
"""
shard_index = await self._load_shard_index_maybe(byte_getter, chunks_per_shard)
if shard_index is None:
Expand All @@ -807,7 +816,11 @@ async def _load_partial_shard_maybe(
byte_ranges = [byte_range for _, byte_range in chunk_coord_byte_ranges]
try:
async for group in byte_getter.store.get_ranges(
byte_getter.path, byte_ranges, prototype=prototype
byte_getter.path,
byte_ranges,
prototype=prototype,
max_gap_bytes=max_gap_bytes,
max_coalesced_bytes=max_coalesced_bytes,
):
for idx, buf in group:
if buf is not None:
Expand Down
6 changes: 6 additions & 0 deletions src/zarr/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,12 @@ def enable_gpu(self) -> ConfigSet:
},
"async": {"concurrency": 10, "timeout": None},
"threading": {"max_workers": None},
"sharding": {
"read": {
"coalesce_max_gap_bytes": 1 << 20, # 1 MiB
"coalesce_max_bytes": 16 << 20, # 16 MiB
},
},
"json_indent": 2,
"codec_pipeline": {
"path": "zarr.core.codec_pipeline.BatchedCodecPipeline",
Expand Down
134 changes: 134 additions & 0 deletions tests/test_codecs/test_sharding_unit.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from typing import TYPE_CHECKING, cast
from unittest.mock import AsyncMock

import numpy as np
import pytest

Expand All @@ -10,9 +13,13 @@
)
from zarr.core.buffer import default_buffer_prototype
from zarr.core.buffer.cpu import Buffer
from zarr.core.config import config
from zarr.storage._common import StorePath
from zarr.storage._memory import MemoryStore

if TYPE_CHECKING:
from zarr.core.array import ShardsConfigParam

# ============================================================================
# _ShardIndex tests
# ============================================================================
Expand Down Expand Up @@ -155,6 +162,8 @@ async def test_load_partial_shard_maybe_index_load_fails() -> None:
prototype=default_buffer_prototype(),
chunks_per_shard=(2,),
all_chunk_coords={(0,)},
max_gap_bytes=1 << 20,
max_coalesced_bytes=16 << 20,
)

assert result is None
Expand Down Expand Up @@ -187,6 +196,8 @@ async def mock_load_index(
prototype=default_buffer_prototype(),
chunks_per_shard=chunks_per_shard,
all_chunk_coords={(0,), (1,), (2,)},
max_gap_bytes=1 << 20,
max_coalesced_bytes=16 << 20,
)

assert result is not None
Expand Down Expand Up @@ -220,6 +231,8 @@ async def mock_load_index(
prototype=default_buffer_prototype(),
chunks_per_shard=chunks_per_shard,
all_chunk_coords={(0,), (1,), (2,)},
max_gap_bytes=1 << 20,
max_coalesced_bytes=16 << 20,
)

assert result == {}
Expand Down Expand Up @@ -251,6 +264,8 @@ async def mock_load_index(
prototype=default_buffer_prototype(),
chunks_per_shard=chunks_per_shard,
all_chunk_coords={(0,), (1,)},
max_gap_bytes=1 << 20,
max_coalesced_bytes=16 << 20,
)

assert result is not None
Expand Down Expand Up @@ -292,6 +307,8 @@ async def mock_load_index(
prototype=default_buffer_prototype(),
chunks_per_shard=chunks_per_shard,
all_chunk_coords={(0,)},
max_gap_bytes=1 << 20,
max_coalesced_bytes=16 << 20,
)

assert result is None
Expand Down Expand Up @@ -336,6 +353,8 @@ async def boom(*args: object, **kwargs: object) -> Buffer | None:
prototype=default_buffer_prototype(),
chunks_per_shard=chunks_per_shard,
all_chunk_coords={(0,)},
max_gap_bytes=1 << 20,
max_coalesced_bytes=16 << 20,
)


Expand Down Expand Up @@ -368,6 +387,8 @@ async def mock_load_index(
prototype=default_buffer_prototype(),
chunks_per_shard=chunks_per_shard,
all_chunk_coords={(0,), (1,)},
max_gap_bytes=1 << 20,
max_coalesced_bytes=16 << 20,
)

assert result is not None
Expand Down Expand Up @@ -405,6 +426,8 @@ async def mock_load_index(
prototype=default_buffer_prototype(),
chunks_per_shard=chunks_per_shard,
all_chunk_coords={(0,)},
max_gap_bytes=1 << 20,
max_coalesced_bytes=16 << 20,
)

assert result == {}
Expand Down Expand Up @@ -486,3 +509,114 @@ def test_is_total_shard_1d() -> None:
# Partial
partial_coords: set[tuple[int, ...]] = {(0,), (2,)}
assert codec._is_total_shard(partial_coords, chunks_per_shard) is False


# ============================================================================
# Coalescing config option tests
#
# Asserts that the `sharding.read.coalesce_max_gap_bytes` and
# `sharding.read.coalesce_max_bytes` config options flow through to
# `Store.get_ranges` as `max_gap_bytes` / `max_coalesced_bytes` kwargs.
# ============================================================================


def _trigger_partial_shard_read() -> AsyncMock:
    """Perform a partial-shard read against a mocked store and return its `get_ranges` mock.

    An 8-element int32 array (chunks of 2, one shard of 8, index at the end of
    the shard) is created on a `MemoryStore` wrapped in an `AsyncMock`, filled
    with data, and then partially read back through normal array indexing.
    Calls recorded while creating and writing the array are cleared before the
    read, so the returned mock only reflects the read itself.
    """
    import zarr
    from zarr.codecs.sharding import ShardingCodecIndexLocation

    values = np.arange(8, dtype="int32")
    inner_chunks = (2,)
    outer_shard = (8,)

    # Wrap a real store so calls are recorded but still hit working storage.
    backing_store = MemoryStore()
    wrapped_store = AsyncMock(wraps=backing_store, spec=backing_store.__class__)

    shards: ShardsConfigParam = {
        "shape": outer_shard,
        "index_location": ShardingCodecIndexLocation.end,
    }
    arr = zarr.create_array(
        StorePath(wrapped_store),
        shape=(8,),
        chunks=inner_chunks,
        shards=shards,
        dtype=values.dtype,
        fill_value=-1,
    )
    arr[:] = values

    # Forget everything recorded during array creation and the write above.
    wrapped_store.reset_mock()

    # Read a strict subset of chunks to take the partial-shard read path.
    _ = arr[0:4]

    get_ranges_mock = wrapped_store.get_ranges
    return cast(AsyncMock, get_ranges_mock)


def test_load_partial_shard_forwards_config_to_get_ranges() -> None:
    """Overridden `sharding.read.*` config values arrive at `Store.get_ranges` as kwargs."""
    expected_gap = 4242
    expected_total = 424242
    overrides = {
        "sharding.read.coalesce_max_gap_bytes": expected_gap,
        "sharding.read.coalesce_max_bytes": expected_total,
    }

    # The read has to happen while the overrides are active.
    with config.set(overrides):
        get_ranges_mock = _trigger_partial_shard_read()

    assert get_ranges_mock.call_count >= 1
    for recorded in get_ranges_mock.call_args_list:
        passed = recorded.kwargs
        assert passed["max_gap_bytes"] == expected_gap
        assert passed["max_coalesced_bytes"] == expected_total


def test_load_partial_shard_uses_config_defaults() -> None:
    """With no override in place, the current `zarr.config` values are forwarded."""
    get_ranges_mock = _trigger_partial_shard_read()

    assert get_ranges_mock.call_count >= 1
    for recorded in get_ranges_mock.call_args_list:
        passed = recorded.kwargs
        # Compare against whatever the config currently reports for each option.
        current_gap = config.get("sharding.read.coalesce_max_gap_bytes")
        assert passed["max_gap_bytes"] == current_gap
        current_total = config.get("sharding.read.coalesce_max_bytes")
        assert passed["max_coalesced_bytes"] == current_total


async def test_load_partial_shard_explicit_kwargs_passthrough(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Explicit `max_gap_bytes` / `max_coalesced_bytes` arguments reach `get_ranges` unchanged."""
    chunks_per_shard = (4,)
    codec = ShardingCodec(chunk_shape=(2,))

    # Hand-built index: chunk (0,) occupies bytes 0-100, chunk (2,) bytes 200-300.
    fake_index = _ShardIndex.create_empty(chunks_per_shard)
    fake_index.set_chunk_slice((0,), slice(0, 100))
    fake_index.set_chunk_slice((2,), slice(200, 300))

    # Real store holding a 300-byte payload, wrapped so that calls are recorded.
    backing_store = MemoryStore()
    await backing_store.set("shard", Buffer.from_bytes(b"x" * 300))
    wrapped_store = AsyncMock(wraps=backing_store, spec=backing_store.__class__)
    shard_path = StorePath(wrapped_store, "shard")

    async def mock_load_index(
        self: ShardingCodec, byte_getter: StorePath, cps: tuple[int, ...]
    ) -> _ShardIndex:
        # Stand-in for the real index loader: always return the hand-built index.
        return fake_index

    monkeypatch.setattr(ShardingCodec, "_load_shard_index_maybe", mock_load_index)

    await codec._load_partial_shard_maybe(
        byte_getter=shard_path,
        prototype=default_buffer_prototype(),
        chunks_per_shard=chunks_per_shard,
        all_chunk_coords={(0,), (2,)},
        max_gap_bytes=12345,
        max_coalesced_bytes=67890,
    )

    get_ranges_mock = wrapped_store.get_ranges
    get_ranges_mock.assert_called_once()
    passed = get_ranges_mock.call_args.kwargs
    assert passed["max_gap_bytes"] == 12345
    assert passed["max_coalesced_bytes"] == 67890
6 changes: 6 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ def test_config_defaults_set() -> None:
},
"async": {"concurrency": 10, "timeout": None},
"threading": {"max_workers": None},
"sharding": {
"read": {
"coalesce_max_gap_bytes": 1 << 20,
"coalesce_max_bytes": 16 << 20,
},
},
"json_indent": 2,
"codec_pipeline": {
"path": "zarr.core.codec_pipeline.BatchedCodecPipeline",
Expand Down
Loading