diff --git a/gcsfs/core.py b/gcsfs/core.py index 566a318fc..e4edfd2a9 100644 --- a/gcsfs/core.py +++ b/gcsfs/core.py @@ -154,6 +154,10 @@ def _coalesce_generation(*args): return generations.pop() +def _is_directory_marker(entry): + return entry["size"] == 0 and entry["name"].endswith("/") + + class GCSFileSystem(asyn.AsyncFileSystem): r""" Connect to Google Cloud Storage. @@ -627,9 +631,11 @@ async def _list_objects(self, path, prefix="", versions=False, **kwargs): "use_snapshot_listing" ) + max_results = kwargs.get("max_results") + # Don't cache prefixed/partial listings, in addition to # not using the inventory report service to do listing directly. - if not prefix and not use_snapshot_listing: + if not prefix and not use_snapshot_listing and not max_results: self.dircache[path] = out return out @@ -683,7 +689,7 @@ async def _do_list_objects( end_offset=None, prefix=prefix, versions=versions, - page_size=default_page_size, + max_results=max_results, ) async def _concurrent_list_objects_helper( @@ -736,7 +742,7 @@ async def _concurrent_list_objects_helper( end_offset=end_offsets[i], prefix=prefix, versions=versions, - page_size=page_size, + max_results=page_size, ) for i in range(0, len(start_offsets)) ] @@ -761,7 +767,7 @@ async def _sequential_list_objects_helper( end_offset, prefix, versions, - page_size, + max_results, ): """ Sequential list objects within the start and end offset range. @@ -778,7 +784,7 @@ async def _sequential_list_objects_helper( prefix=prefix, startOffset=start_offset, endOffset=end_offset, - maxResults=page_size, + maxResults=max_results, json_out=True, versions="true" if versions else None, ) @@ -796,7 +802,7 @@ async def _sequential_list_objects_helper( prefix=prefix, startOffset=start_offset, endOffset=end_offset, - maxResults=page_size, + maxResults=max_results, pageToken=next_page_token, json_out=True, versions="true" if versions else None, @@ -985,7 +991,7 @@ def _parse_timestamp(self, timestamp): async def _info(self, path, generation=None, **kwargs): """File information about this path.""" - path = self._strip_protocol(path) + path = self._strip_protocol(path).rstrip("/") if "/" not in path: try: out = await self._call("GET", f"b/{path}", json_out=True) @@ -1014,7 +1020,7 @@ async def _info(self, path, generation=None, **kwargs): # this is a directory return { "bucket": bucket, - "name": path.rstrip("/"), + "name": path, "size": 0, "storageClass": "DIRECTORY", "type": "directory", @@ -1023,21 +1029,20 @@ async def _info(self, path, generation=None, **kwargs): try: exact = await self._get_object(path) # this condition finds a "placeholder" - still need to check if it's a directory - if exact["size"] or not exact["name"].endswith("/"): + if not _is_directory_marker(exact): return exact except FileNotFoundError: pass - kwargs["detail"] = True # Force to true for info - out = await self._ls(path, **kwargs) - out0 = [o for o in out if o["name"].rstrip("/") == path] - if out0: + out = await self._list_objects(path, max_results=1) + exact = next((o for o in out if o["name"].rstrip("/") == path), None) + if exact and not _is_directory_marker(exact): # exact hit - return out0[0] + return exact elif out: # other stuff - must be a directory return { "bucket": bucket, - "name": path.rstrip("/"), + "name": path, "size": 0, "storageClass": "DIRECTORY", "type": "directory", @@ -1057,18 +1062,36 @@ async def _ls( out = await self._list_buckets() else: out = [] + dir_names = set() for entry in await self._list_objects( path, prefix=prefix, versions=versions, **kwargs ): + if _is_directory_marker(entry): + entry = { + "bucket": entry["bucket"], + "name": path.rstrip("/"), + "size": 0, + "storageClass": "DIRECTORY", + "type": "directory", + } + + if entry["type"] == "directory": + if entry["name"] in dir_names: + continue + dir_names.add(entry["name"]) + if versions and "generation" in entry: entry = entry.copy() entry["name"] = f"{entry['name']}#{entry['generation']}" + out.append(entry) + out.sort(key=lambda e: (e["name"])) + if detail: return out else: - return sorted([o["name"] for o in out]) + return [o["name"] for o in out] def url(self, path): """Get HTTP URL of the given path""" @@ -1490,6 +1513,7 @@ async def _put_file( self.invalidate_cache(self._parent(rpath)) async def _isdir(self, path): + try: return (await self._info(path))["type"] == "directory" except OSError: diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py index 8573e68af..9886c5042 100644 --- a/gcsfs/tests/test_core.py +++ b/gcsfs/tests/test_core.py @@ -1223,6 +1223,47 @@ def test_dir_marker(gcs): assert out2["type"] == "directory" +def test_dir_marker_directory_not_listed(gcs): + gcs.touch(f"{TEST_BUCKET}/psudodir/") + gcs.touch(f"{TEST_BUCKET}/psudodir/innerfolder/innerfile") + gcs.invalidate_cache() + info = gcs.info(f"{TEST_BUCKET}/psudodir") + assert info["type"] == "directory" + + +def test_dir_marker_directory_listed(gcs): + gcs.touch(f"{TEST_BUCKET}/psudodir/") + gcs.touch(f"{TEST_BUCKET}/psudodir/innerfolder/innerfile") + gcs.invalidate_cache() + gcs.ls(f"{TEST_BUCKET}/psudodir") + info = gcs.info(f"{TEST_BUCKET}/psudodir") + assert info["type"] == "directory" + + +def test_dir_marker_parent_directory_listed(gcs): + gcs.touch(f"{TEST_BUCKET}/parent_psudodir/psudodir/") + gcs.touch(f"{TEST_BUCKET}/parent_psudodir/psudodir/innerfolder/innerfile") + gcs.invalidate_cache() + gcs.ls(f"{TEST_BUCKET}/parent_psudodir") + info = gcs.info(f"{TEST_BUCKET}/parent_psudodir/psudodir") + assert info["type"] == "directory" + + +def test_dir_marker_info_eq_ls(gcs): + gcs.touch(f"{TEST_BUCKET}/psudodir/") + gcs.invalidate_cache() + out1 = gcs.info(f"{TEST_BUCKET}/psudodir") + out2 = gcs.ls(f"{TEST_BUCKET}/psudodir", detail=True)[0] + assert out1["type"] == "directory" + assert out1 == out2 + + gcs.invalidate_cache() + out3 = gcs.ls(f"{TEST_BUCKET}/psudodir", detail=True)[0] + out4 = gcs.info(f"{TEST_BUCKET}/psudodir") + assert out3["type"] == "directory" + assert out3 == out4 + + def test_mkdir_with_path(gcs): with pytest.raises(FileNotFoundError): gcs.mkdir(f"{TEST_BUCKET + 'new'}/path", create_parents=False)