Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 40 additions & 16 deletions gcsfs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,10 @@ def _coalesce_generation(*args):
return generations.pop()


def _is_directory_marker(entry):
return entry["size"] == 0 and entry["name"].endswith("/")


class GCSFileSystem(asyn.AsyncFileSystem):
r"""
Connect to Google Cloud Storage.
Expand Down Expand Up @@ -627,9 +631,11 @@ async def _list_objects(self, path, prefix="", versions=False, **kwargs):
"use_snapshot_listing"
)

max_results = kwargs.get("max_results")

# Don't cache prefixed/partial listings, in addition to
# not using the inventory report service to do listing directly.
if not prefix and not use_snapshot_listing:
if not prefix and not use_snapshot_listing and not max_results:
self.dircache[path] = out
return out

Expand Down Expand Up @@ -683,7 +689,7 @@ async def _do_list_objects(
end_offset=None,
prefix=prefix,
versions=versions,
page_size=default_page_size,
max_results=max_results,
)

async def _concurrent_list_objects_helper(
Expand Down Expand Up @@ -736,7 +742,7 @@ async def _concurrent_list_objects_helper(
end_offset=end_offsets[i],
prefix=prefix,
versions=versions,
page_size=page_size,
max_results=page_size,
)
for i in range(0, len(start_offsets))
]
Expand All @@ -761,7 +767,7 @@ async def _sequential_list_objects_helper(
end_offset,
prefix,
versions,
page_size,
max_results,
):
"""
Sequential list objects within the start and end offset range.
Expand All @@ -778,7 +784,7 @@ async def _sequential_list_objects_helper(
prefix=prefix,
startOffset=start_offset,
endOffset=end_offset,
maxResults=page_size,
maxResults=max_results,
json_out=True,
versions="true" if versions else None,
)
Expand All @@ -796,7 +802,7 @@ async def _sequential_list_objects_helper(
prefix=prefix,
startOffset=start_offset,
endOffset=end_offset,
maxResults=page_size,
maxResults=max_results,
pageToken=next_page_token,
json_out=True,
versions="true" if versions else None,
Expand Down Expand Up @@ -985,7 +991,7 @@ def _parse_timestamp(self, timestamp):

async def _info(self, path, generation=None, **kwargs):
"""File information about this path."""
path = self._strip_protocol(path)
path = self._strip_protocol(path).rstrip("/")
if "/" not in path:
try:
out = await self._call("GET", f"b/{path}", json_out=True)
Expand Down Expand Up @@ -1014,7 +1020,7 @@ async def _info(self, path, generation=None, **kwargs):
# this is a directory
return {
"bucket": bucket,
"name": path.rstrip("/"),
"name": path,
"size": 0,
"storageClass": "DIRECTORY",
"type": "directory",
Expand All @@ -1023,21 +1029,20 @@ async def _info(self, path, generation=None, **kwargs):
try:
exact = await self._get_object(path)
# this condition finds a "placeholder" - still need to check if it's a directory
if exact["size"] or not exact["name"].endswith("/"):
if not _is_directory_marker(exact):
return exact
except FileNotFoundError:
pass
kwargs["detail"] = True # Force to true for info
out = await self._ls(path, **kwargs)
out0 = [o for o in out if o["name"].rstrip("/") == path]
if out0:
out = await self._list_objects(path, max_results=1)
exact = next((o for o in out if o["name"].rstrip("/") == path), None)
if exact and not _is_directory_marker(exact):
# exact hit
return out0[0]
return exact
elif out:
# other stuff - must be a directory
return {
"bucket": bucket,
"name": path.rstrip("/"),
"name": path,
"size": 0,
"storageClass": "DIRECTORY",
"type": "directory",
Expand All @@ -1057,18 +1062,36 @@ async def _ls(
out = await self._list_buckets()
else:
out = []
dir_names = set()
for entry in await self._list_objects(
path, prefix=prefix, versions=versions, **kwargs
):
if _is_directory_marker(entry):
entry = {
"bucket": entry["bucket"],
"name": path.rstrip("/"),
"size": 0,
"storageClass": "DIRECTORY",
"type": "directory",
}

if entry["type"] == "directory":
if entry["name"] in dir_names:
continue
dir_names.add(entry["name"])

if versions and "generation" in entry:
entry = entry.copy()
entry["name"] = f"{entry['name']}#{entry['generation']}"

out.append(entry)

out.sort(key=lambda e: (e["name"]))

if detail:
return out
else:
return sorted([o["name"] for o in out])
return [o["name"] for o in out]

def url(self, path):
"""Get HTTP URL of the given path"""
Expand Down Expand Up @@ -1490,6 +1513,7 @@ async def _put_file(
self.invalidate_cache(self._parent(rpath))

async def _isdir(self, path):

try:
return (await self._info(path))["type"] == "directory"
except OSError:
Expand Down
41 changes: 41 additions & 0 deletions gcsfs/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1223,6 +1223,47 @@ def test_dir_marker(gcs):
assert out2["type"] == "directory"


def test_dir_marker_directory_not_listed(gcs):
gcs.touch(f"{TEST_BUCKET}/psudodir/")
gcs.touch(f"{TEST_BUCKET}/psudodir/innerfolder/innerfile")
gcs.invalidate_cache()
info = gcs.info(f"{TEST_BUCKET}/psudodir")
assert info["type"] == "directory"


def test_dir_marker_directory_listed(gcs):
gcs.touch(f"{TEST_BUCKET}/psudodir/")
gcs.touch(f"{TEST_BUCKET}/psudodir/innerfolder/innerfile")
gcs.invalidate_cache()
gcs.ls(f"{TEST_BUCKET}/psudodir")
info = gcs.info(f"{TEST_BUCKET}/psudodir")
assert info["type"] == "directory"


def test_dir_marker_parent_directory_listed(gcs):
gcs.touch(f"{TEST_BUCKET}/parent_psudodir/psudodir/")
gcs.touch(f"{TEST_BUCKET}/parent_psudodir/psudodir/innerfolder/innerfile")
gcs.invalidate_cache()
gcs.ls(f"{TEST_BUCKET}/parent_psudodir")
info = gcs.info(f"{TEST_BUCKET}/parent_psudodir/psudodir")
assert info["type"] == "directory"


def test_dir_marker_info_eq_ls(gcs):
gcs.touch(f"{TEST_BUCKET}/psudodir/")
gcs.invalidate_cache()
out1 = gcs.info(f"{TEST_BUCKET}/psudodir")
out2 = gcs.ls(f"{TEST_BUCKET}/psudodir", detail=True)[0]
assert out1["type"] == "directory"
assert out1 == out2

gcs.invalidate_cache()
out3 = gcs.ls(f"{TEST_BUCKET}/psudodir", detail=True)[0]
out4 = gcs.info(f"{TEST_BUCKET}/psudodir")
assert out3["type"] == "directory"
assert out3 == out4


def test_mkdir_with_path(gcs):
with pytest.raises(FileNotFoundError):
gcs.mkdir(f"{TEST_BUCKET + 'new'}/path", create_parents=False)
Expand Down
Loading