Reviewed revision of the cache_unknown_buckets patch.
NOTE(review): indentation was reconstructed from a whitespace-collapsed copy - confirm
against the target files. The post-image blob ids on the `index` lines belong to the
pre-review patch and should be regenerated.

diff --git a/gcsfs/extended_gcsfs.py b/gcsfs/extended_gcsfs.py
index 175890af..dd801457 100644
--- a/gcsfs/extended_gcsfs.py
+++ b/gcsfs/extended_gcsfs.py
@@ -104,6 +104,10 @@ def __init__(self, *args, finalize_on_close=False, **kwargs):
             - retry_multiplier: Multiplier for delay between retries.
             These map to `google.api_core.retry.AsyncRetry` arguments (without 'retry_' prefix).
+            Also accepts `cache_unknown_buckets` (bool, default False): whether to
+            cache UNKNOWN bucket types. Useful when users lack permissions for the
+            Storage Control API, to avoid repeated slow failing lookups.
         """
+        self._cache_unknown_buckets = kwargs.pop("cache_unknown_buckets", False)
         valid_keys = DEFAULT_RETRY_CONFIG.keys()
         self.retry_config = {
             k[6:]: v
@@ -194,8 +198,9 @@ async def _lookup_bucket_type(self, bucket):
         if bucket in self._storage_layout_cache:
             return self._storage_layout_cache[bucket]
         bucket_type = await self._get_bucket_type(bucket)
-        # Dont cache UNKNOWN type
-        if bucket_type == BucketType.UNKNOWN:
+        # UNKNOWN is cached only on opt-in (cache_unknown_buckets); otherwise
+        # the next lookup retries the bucket type detection.
+        if bucket_type == BucketType.UNKNOWN and not self._cache_unknown_buckets:
             return bucket_type
         self._storage_layout_cache[bucket] = bucket_type
         return self._storage_layout_cache[bucket]
diff --git a/gcsfs/tests/test_extended_gcsfs.py b/gcsfs/tests/test_extended_gcsfs.py
index 906f8f22..cdc02a8e 100644
--- a/gcsfs/tests/test_extended_gcsfs.py
+++ b/gcsfs/tests/test_extended_gcsfs.py
@@ -1611,3 +1611,38 @@ async def test_cat_file_non_zonal_fallback(extended_gcsfs):
         mock_super_cat.assert_awaited_once_with(
             "standard_bucket/obj", start=10, end=20, concurrency=2, custom_arg="val"
         )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "cache_unknown, expected_calls",
+    [(False, 2), (True, 1)],
+    ids=["not_cached", "cached"],
+)
+async def test_lookup_bucket_type_unknown_caching(
+    extended_gcsfs, cache_unknown, expected_calls
+):
+    """Test that UNKNOWN is cached only when _cache_unknown_buckets is True."""
+    fs = extended_gcsfs
+
+    # Start from an empty cache so the first lookup always reaches the mock.
+    fs._storage_layout_cache.clear()
+
+    # patch.object restores the flag afterwards, so the fixture is left as found.
+    with mock.patch.object(
+        fs, "_cache_unknown_buckets", cache_unknown
+    ), mock.patch.object(
+        fs, "_get_bucket_type", new_callable=mock.AsyncMock
+    ) as mock_get_type:
+        mock_get_type.return_value = BucketType.UNKNOWN
+        try:
+            # First lookup always queries the bucket type.
+            assert await fs._lookup_bucket_type("my-bucket") == BucketType.UNKNOWN
+            assert mock_get_type.call_count == 1
+
+            # Second lookup queries again only when UNKNOWN was not cached.
+            assert await fs._lookup_bucket_type("my-bucket") == BucketType.UNKNOWN
+            assert mock_get_type.call_count == expected_calls
+        finally:
+            # Drop any cached UNKNOWN entry so it cannot affect later tests.
+            fs._storage_layout_cache.clear()