Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion changes/3924.bugfix.md

This file was deleted.

18 changes: 9 additions & 9 deletions src/zarr/storage/_fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
)
from zarr.core.buffer import Buffer
from zarr.errors import ZarrUserWarning
from zarr.storage._utils import _join_paths, normalize_path
from zarr.storage._utils import _dereference_path

if TYPE_CHECKING:
from collections.abc import AsyncIterator, Iterable
Expand Down Expand Up @@ -127,7 +127,7 @@ def __init__(
) -> None:
super().__init__(read_only=read_only)
self.fs = fs
self.path = normalize_path(path)
self.path = path
self.allowed_exceptions = allowed_exceptions

if not self.fs.async_impl:
Expand Down Expand Up @@ -282,7 +282,7 @@ async def get(
# docstring inherited
if not self._is_open:
await self._open()
path = _join_paths([self.path, key])
path = _dereference_path(self.path, key)

try:
if byte_range is None:
Expand Down Expand Up @@ -329,7 +329,7 @@ async def set(
raise TypeError(
f"FsspecStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead."
)
path = _join_paths([self.path, key])
path = _dereference_path(self.path, key)
# write data
if byte_range:
raise NotImplementedError
Expand All @@ -338,7 +338,7 @@ async def set(
async def delete(self, key: str) -> None:
# docstring inherited
self._check_writable()
path = _join_paths([self.path, key])
path = _dereference_path(self.path, key)
try:
await self.fs._rm(path)
except FileNotFoundError:
Expand All @@ -354,14 +354,14 @@ async def delete_dir(self, prefix: str) -> None:
)
self._check_writable()

path_to_delete = _join_paths([self.path, prefix])
path_to_delete = _dereference_path(self.path, prefix)

with suppress(*self.allowed_exceptions):
await self.fs._rm(path_to_delete, recursive=True)

async def exists(self, key: str) -> bool:
# docstring inherited
path = _join_paths([self.path, key])
path = _dereference_path(self.path, key)
exists: bool = await self.fs._exists(path)
return exists

Expand All @@ -378,7 +378,7 @@ async def get_partial_values(
starts: list[int | None] = []
stops: list[int | None] = []
for key, byte_range in key_ranges:
paths.append(_join_paths([self.path, key]))
paths.append(_dereference_path(self.path, key))
if byte_range is None:
starts.append(None)
stops.append(None)
Expand Down Expand Up @@ -429,7 +429,7 @@ async def list_prefix(self, prefix: str) -> AsyncIterator[str]:
yield onefile.removeprefix(f"{self.path}/")

async def getsize(self, key: str) -> int:
path = _join_paths([self.path, key])
path = _dereference_path(self.path, key)
info = await self.fs._info(path)

size = info.get("size")
Expand Down
47 changes: 47 additions & 0 deletions src/zarr/storage/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,53 @@ def _join_paths(paths: Iterable[str]) -> str:
return "/".join(filter(lambda v: v != "", paths))


def _dereference_path(root: str, path: str) -> str:
"""
Combine a store-side root with a key into a single fully-qualified path.

Unlike `_join_paths`, this is purpose-built for the case where `root` is
an opaque backend-side prefix that may use `"/"` as a sentinel for "root
of the filesystem" (notably for fsspec's `ReferenceFileSystem`). A
trailing `"/"` is stripped from `root` before joining; if `root` is then
empty, the bare `path` is returned so that joining `"/"` with `"key"`
yields `"key"` rather than `"//key"`. A trailing `"/"` on the result is
also stripped.

Leading slashes on `root` are preserved -- a backend-side path like
`"/home/foo/data.zarr"` is an absolute filesystem path for
`LocalFileSystem` and must not lose its leading separator.

Parameters
----------
root : str
The backend-side root of a store. May be `""`, `"/"`, an absolute
filesystem path, or a backend-specific prefix.
path : str
The key within the store, typically a zarr key like `"zarr.json"`
or `"a/b/c/zarr.json"`.

Returns
-------
str
`root` and `path` joined by a single `"/"`, with the `"/"` sentinel
collapsed and trailing slashes removed.

Examples
--------
```python
from zarr.storage._utils import _dereference_path
_dereference_path("/", "zarr.json") # 'zarr.json'
_dereference_path("", "zarr.json") # 'zarr.json'
_dereference_path("/home/foo", "zarr.json") # '/home/foo/zarr.json'
_dereference_path("/home/foo/", "zarr.json") # '/home/foo/zarr.json'
_dereference_path("bucket/p", "zarr.json") # 'bucket/p/zarr.json'
```
"""
root = root.rstrip("/")
path = f"{root}/{path}" if root else path
return path.rstrip("/")


def _relativize_path(*, path: str, prefix: str) -> str:
"""
Make a "/"-delimited path relative to some prefix. If the prefix is '', then the path is
Expand Down
124 changes: 106 additions & 18 deletions tests/test_store/test_fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from zarr.errors import ZarrUserWarning
from zarr.storage import FsspecStore
from zarr.storage._fsspec import _make_async
from zarr.storage._utils import normalize_path
from zarr.testing.store import StoreTests

if TYPE_CHECKING:
Expand Down Expand Up @@ -287,25 +286,114 @@ def array_roundtrip(store: FsspecStore) -> None:
np.testing.assert_array_equal(arr[:], data)


@pytest.mark.skipif(
parse_version(fsspec.__version__) < parse_version("2024.12.0"),
reason="No AsyncFileSystemWrapper",
@pytest.mark.parametrize(
("root", "key", "expected"),
[
# `"/"` as root collapses so that bare-key backends (notably
# ReferenceFileSystem) get the right key. Regression test for
# https://github.com/zarr-developers/zarr-python/issues/3922 .
("/", "zarr.json", "zarr.json"),
("", "zarr.json", "zarr.json"),
# Trailing slashes on the root are stripped before joining.
("foo/", "zarr.json", "foo/zarr.json"),
("foo", "zarr.json", "foo/zarr.json"),
# Leading slashes on the root are preserved -- absolute filesystem
# paths must stay absolute. Regression test for the titiler-xarray
# breakage that #3924 introduced when `normalize_path` was applied to
# `FsspecStore.path`.
("/home/runner/data.zarr", "zarr.json", "/home/runner/data.zarr/zarr.json"),
("/home/runner/data.zarr/", "zarr.json", "/home/runner/data.zarr/zarr.json"),
# Multi-segment keys.
("/home/foo", "a/b/zarr.json", "/home/foo/a/b/zarr.json"),
("", "a/b/zarr.json", "a/b/zarr.json"),
# Trailing slash on the result is stripped (relevant when key is "").
("/home/foo", "", "/home/foo"),
],
)
@pytest.mark.parametrize("path", ["", "/", "//", "foo", "foo/", "/foo", "/foo/", "//foo//"])
def test_fsspec_store_path_normalization(path: str) -> None:
"""`FsspecStore.path` is normalized to the canonical form, matching
`normalize_path`, regardless of the surface representation the caller
supplies.

Regression test for https://github.com/zarr-developers/zarr-python/issues/3922
-- when a caller passed `path="/"` the leading slash flowed through
unmodified to subsequent `_join_paths([self.path, key])` calls, producing
`"//key"` and missing the underlying object.
def test_dereference_path(root: str, key: str, expected: str) -> None:
"""Verify the contract `_dereference_path` provides for `FsspecStore`.

`FsspecStore.path` is stored verbatim; the join with a key must collapse a
sentinel `"/"` root, strip trailing slashes, and preserve leading
slashes on absolute paths.
"""
from zarr.storage._utils import _dereference_path

assert _dereference_path(root, key) == expected


async def test_fsspec_store_open_group_via_reference_filesystem() -> None:
"""End-to-end regression test for
https://github.com/zarr-developers/zarr-python/issues/3922 .

``ReferenceFileSystem`` keys its refs by bare strings like ``"zarr.json"``.
The bug was that ``FsspecStore(fs=ref_fs, path="/")`` produced
``"//zarr.json"`` at the join site and failed to find the entry, raising
``GroupNotFoundError``. This test pins ``path="/"`` explicitly to keep
coverage even if the default value changes later.
"""
import json

from fsspec.implementations.reference import ReferenceFileSystem

group_json = json.dumps({"zarr_format": 3, "node_type": "group", "attributes": {}})
fs = ReferenceFileSystem(
fo={"version": 1, "refs": {"zarr.json": group_json}},
asynchronous=True,
)
store = FsspecStore(fs=fs, path="/", read_only=True)
group = await zarr.api.asynchronous.open_group(store, mode="r")
assert group.metadata.zarr_format == 3


async def test_fsspec_store_read_array_chunk_via_reference_filesystem() -> None:
"""End-to-end regression test that exercises the byte-range read path
against ``ReferenceFileSystem``.

Beyond opening a group (covered by
``test_fsspec_store_open_group_via_reference_filesystem``), this test
constructs a small zarr v3 array whose chunk lives in the refs dict and
reads it through the store. Path-handling bugs on the byte-range
fetch path (used by kerchunk-style virtualization) would surface here
rather than at metadata-open time.
"""
sync_fs = fsspec.filesystem("memory")
fs = _make_async(sync_fs)
store = FsspecStore(fs=fs, path=path)
assert store.path == normalize_path(path)
import json

import numpy as np
from fsspec.implementations.reference import ReferenceFileSystem

# Construct a minimal v3 zarr: a single 1-D uint8 array of length 4 with
# one chunk of size 4. The chunk bytes are little-endian uint8s 1..4.
array_meta = json.dumps(
{
"zarr_format": 3,
"node_type": "array",
"shape": [4],
"chunk_grid": {"name": "regular", "configuration": {"chunk_shape": [4]}},
"data_type": "uint8",
"chunk_key_encoding": {"name": "default", "configuration": {"separator": "/"}},
"fill_value": 0,
"codecs": [{"name": "bytes", "configuration": {"endian": "little"}}],
"attributes": {},
}
)
chunk_bytes = bytes([1, 2, 3, 4])

refs: dict[str, str] = {
"zarr.json": array_meta,
# ReferenceFileSystem accepts raw bytes via base64 encoding or
# latin-1-decoded strings; latin-1 round-trips bytes 1:1.
"c/0": chunk_bytes.decode("latin-1"),
}

fs = ReferenceFileSystem(
fo={"version": 1, "refs": refs},
asynchronous=True,
)
store = FsspecStore(fs=fs, path="/", read_only=True)
array = await zarr.api.asynchronous.open_array(store=store, mode="r")
data = await array.getitem(slice(None))
np.testing.assert_array_equal(data, np.array([1, 2, 3, 4], dtype="uint8"))


@pytest.mark.skipif(
Expand Down
Loading