From deae723f9e7c91f6f1ec48d39674aed8b11a2077 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Lie=20Rapp?= <90246213+Bjoern-Rapp@users.noreply.github.com> Date: Thu, 24 Apr 2025 13:49:25 +0200 Subject: [PATCH 1/6] Add test suggested in https://github.com/fsspec/gcsfs/pull/313 --- gcsfs/tests/test_core.py | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 8573e68af..9886c5042 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -1223,6 +1223,47 @@ def test_dir_marker(gcs): assert out2["type"] == "directory" +def test_dir_marker_directory_not_listed(gcs): + gcs.touch(f"{TEST_BUCKET}/psudodir/") + gcs.touch(f"{TEST_BUCKET}/psudodir/innerfolder/innerfile") + gcs.invalidate_cache() + info = gcs.info(f"{TEST_BUCKET}/psudodir") + assert info["type"] == "directory" + + +def test_dir_marker_directory_listed(gcs): + gcs.touch(f"{TEST_BUCKET}/psudodir/") + gcs.touch(f"{TEST_BUCKET}/psudodir/innerfolder/innerfile") + gcs.invalidate_cache() + gcs.ls(f"{TEST_BUCKET}/psudodir") + info = gcs.info(f"{TEST_BUCKET}/psudodir") + assert info["type"] == "directory" + + +def test_dir_marker_parent_directory_listed(gcs): + gcs.touch(f"{TEST_BUCKET}/parent_psudodir/psudodir/") + gcs.touch(f"{TEST_BUCKET}/parent_psudodir/psudodir/innerfolder/innerfile") + gcs.invalidate_cache() + gcs.ls(f"{TEST_BUCKET}/parent_psudodir") + info = gcs.info(f"{TEST_BUCKET}/parent_psudodir/psudodir") + assert info["type"] == "directory" + + +def test_dir_marker_info_eq_ls(gcs): + gcs.touch(f"{TEST_BUCKET}/psudodir/") + gcs.invalidate_cache() + out1 = gcs.info(f"{TEST_BUCKET}/psudodir") + out2 = gcs.ls(f"{TEST_BUCKET}/psudodir", detail=True)[0] + assert out1["type"] == "directory" + assert out1 == out2 + + gcs.invalidate_cache() + out3 = gcs.ls(f"{TEST_BUCKET}/psudodir", detail=True)[0] + out4 = gcs.info(f"{TEST_BUCKET}/psudodir") + assert out3["type"] == "directory" + assert out3 == out4 + + def test_mkdir_with_path(gcs): with pytest.raises(FileNotFoundError): gcs.mkdir(f"{TEST_BUCKET + 'new'}/path", create_parents=False) From 1d504297f1d658d02763d30d1f282c8bc56aff07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Lie=20Rapp?= <90246213+Bjoern-Rapp@users.noreply.github.com> Date: Thu, 24 Apr 2025 13:50:32 +0200 Subject: [PATCH 2/6] Treat empty files with trailing underscore as empty folder in _ls --- gcsfs/core.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index 566a318fc..2c0659262 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -985,7 +985,7 @@ def _parse_timestamp(self, timestamp): async def _info(self, path, generation=None, **kwargs): """File information about this path.""" - path = self._strip_protocol(path) + path = self._strip_protocol(path).rstrip("/") if "/" not in path: try: out = await self._call("GET", f"b/{path}", json_out=True) @@ -1014,7 +1014,7 @@ async def _info(self, path, generation=None, **kwargs): # this is a directory return { "bucket": bucket, - "name": path.rstrip("/"), + "name": path, "size": 0, "storageClass": "DIRECTORY", "type": "directory", @@ -1029,15 +1029,15 @@ async def _info(self, path, generation=None, **kwargs): pass kwargs["detail"] = True # Force to true for info out = await self._ls(path, **kwargs) - out0 = [o for o in out if o["name"].rstrip("/") == path] - if out0: + match = next((o for o in out if o["name"].rstrip("/") == path), None) + if match: # exact hit - return out0[0] + return match elif out: # other stuff - must be a directory return { "bucket": bucket, - "name": path.rstrip("/"), + "name": path, "size": 0, "storageClass": "DIRECTORY", "type": "directory", @@ -1060,13 +1060,32 @@ async def _ls( for entry in await self._list_objects( path, prefix=prefix, versions=versions, **kwargs ): + if entry["size"] == 0 and entry["name"].endswith("/"): + entry = { + "bucket": entry["bucket"], + "name": path.rstrip("/"), + "size": 0, + "storageClass": "DIRECTORY", + "type": "directory", + } + if entry in out: + continue + if versions and "generation" in entry: entry = entry.copy() entry["name"] = f"{entry['name']}#{entry['generation']}" + out.append(entry) if detail: - return out + return sorted( + out, + key=lambda e: ( + e["name"].count("/"), + e["type"] != "directory", + e["name"], + ), + ) else: return sorted([o["name"] for o in out]) @@ -1490,6 +1509,7 @@ async def _put_file( self.invalidate_cache(self._parent(rpath)) async def _isdir(self, path): + try: return (await self._info(path))["type"] == "directory" except OSError: From f3a534fe9a1498ebd3264471fabe68688663cf6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Lie=20Rapp?= <90246213+Bjoern-Rapp@users.noreply.github.com> Date: Thu, 24 Apr 2025 15:37:56 +0200 Subject: [PATCH 3/6] Limit storage.object.list to maxResults=1 from _info --- gcsfs/core.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index 2c0659262..dbb514a83 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -683,7 +683,7 @@ async def _do_list_objects( end_offset=None, prefix=prefix, versions=versions, - page_size=default_page_size, + page_size=max_results, ) async def _concurrent_list_objects_helper( @@ -1023,16 +1023,15 @@ async def _info(self, path, generation=None, **kwargs): try: exact = await self._get_object(path) # this condition finds a "placeholder" - still need to check if it's a directory - if exact["size"] or not exact["name"].endswith("/"): + if not (exact["size"] == 0 and exact["name"].endswith("/")): return exact except FileNotFoundError: pass - kwargs["detail"] = True # Force to true for info - out = await self._ls(path, **kwargs) - match = next((o for o in out if o["name"].rstrip("/") == path), None) - if match: + out = await self._list_objects(path, max_results=1) + exact = next((o for o in out if o["name"].rstrip("/") == path), None) + if exact and not (exact["size"] == 0 and exact["name"].endswith("/")): # exact hit - return match + return exact elif out: # other stuff - must be a directory return { From dd3bb007ec8f6fe9cbec98a54399d3f7dfbb653e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Lie=20Rapp?= <90246213+Bjoern-Rapp@users.noreply.github.com> Date: Fri, 25 Apr 2025 14:29:42 +0200 Subject: [PATCH 4/6] Don't update cache with partial listing. --- gcsfs/core.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index dbb514a83..592bfc4ff 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -627,9 +627,11 @@ async def _list_objects(self, path, prefix="", versions=False, **kwargs): "use_snapshot_listing" ) + max_results = kwargs.get("max_results") + # Don't cache prefixed/partial listings, in addition to # not using the inventory report service to do listing directly. - if not prefix and not use_snapshot_listing: + if not prefix and not use_snapshot_listing and not max_results: self.dircache[path] = out return out @@ -683,7 +685,7 @@ async def _do_list_objects( end_offset=None, prefix=prefix, versions=versions, - page_size=max_results, + max_results=max_results, ) async def _concurrent_list_objects_helper( @@ -736,7 +738,7 @@ async def _concurrent_list_objects_helper( end_offset=end_offsets[i], prefix=prefix, versions=versions, - page_size=page_size, + max_results=page_size, ) for i in range(0, len(start_offsets)) ] @@ -761,7 +763,7 @@ async def _sequential_list_objects_helper( end_offset, prefix, versions, - page_size, + max_results, ): """ Sequential list objects within the start and end offset range. @@ -778,7 +780,7 @@ async def _sequential_list_objects_helper( prefix=prefix, startOffset=start_offset, endOffset=end_offset, - maxResults=page_size, + maxResults=max_results, json_out=True, versions="true" if versions else None, ) @@ -796,7 +798,7 @@ async def _sequential_list_objects_helper( prefix=prefix, startOffset=start_offset, endOffset=end_offset, - maxResults=page_size, + maxResults=max_results, pageToken=next_page_token, json_out=True, versions="true" if versions else None, From fc9af2889b61e63664904f057bc821ce23294e88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Lie=20Rapp?= <90246213+Bjoern-Rapp@users.noreply.github.com> Date: Mon, 28 Apr 2025 11:57:41 +0200 Subject: [PATCH 5/6] Sort by directories by name. Optimize directory deduplication. --- gcsfs/core.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index 592bfc4ff..5fc917b8a 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -1058,6 +1058,7 @@ async def _ls( out = await self._list_buckets() else: out = [] + dir_names = set() for entry in await self._list_objects( path, prefix=prefix, versions=versions, **kwargs ): @@ -1069,8 +1070,11 @@ async def _ls( "storageClass": "DIRECTORY", "type": "directory", } - if entry in out: + + if entry["type"] == "directory": + if entry["name"] in dir_names: continue + dir_names.add(entry["name"]) if versions and "generation" in entry: entry = entry.copy() @@ -1078,17 +1082,12 @@ async def _ls( out.append(entry) + out.sort(key=lambda e: (e["name"])) + if detail: - return sorted( - out, - key=lambda e: ( - e["name"].count("/"), - e["type"] != "directory", - e["name"], - ), - ) + return out else: - return sorted([o["name"] for o in out]) + return [o["name"] for o in out] def url(self, path): """Get HTTP URL of the given path""" From a2a9fb0e2921390777a492dbd76787905bafa8cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Lie=20Rapp?= <90246213+Bjoern-Rapp@users.noreply.github.com> Date: Mon, 28 Apr 2025 12:16:23 +0200 Subject: [PATCH 6/6] Move check to function. --- gcsfs/core.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/gcsfs/core.py b/gcsfs/core.py index 5fc917b8a..e4edfd2a9 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -154,6 +154,10 @@ def _coalesce_generation(*args): return generations.pop() +def _is_directory_marker(entry): + return entry["size"] == 0 and entry["name"].endswith("/") + + class GCSFileSystem(asyn.AsyncFileSystem): r""" Connect to Google Cloud Storage. @@ -1025,13 +1029,13 @@ async def _info(self, path, generation=None, **kwargs): try: exact = await self._get_object(path) # this condition finds a "placeholder" - still need to check if it's a directory - if not (exact["size"] == 0 and exact["name"].endswith("/")): + if not _is_directory_marker(exact): return exact except FileNotFoundError: pass out = await self._list_objects(path, max_results=1) exact = next((o for o in out if o["name"].rstrip("/") == path), None) - if exact and not (exact["size"] == 0 and exact["name"].endswith("/")): + if exact and not _is_directory_marker(exact): # exact hit return exact elif out: @@ -1062,7 +1066,7 @@ async def _ls( for entry in await self._list_objects( path, prefix=prefix, versions=versions, **kwargs ): - if entry["size"] == 0 and entry["name"].endswith("/"): + if _is_directory_marker(entry): entry = { "bucket": entry["bucket"], "name": path.rstrip("/"),