From 2a557f895ec8778aa0b789e93baec52568a8effc Mon Sep 17 00:00:00 2001 From: pomponchik Date: Sun, 31 May 2026 02:44:27 +0300 Subject: [PATCH 1/6] New version tag --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f46c9d1..22843b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "dirstree" -version = "0.0.10" +version = "0.0.11" authors = [{ name = "Evgeniy Blinov", email = "zheni-b@yandex.ru" }] description = 'Another library for iterating through the contents of a directory' readme = "README.md" From 9d2b451eb1bf9114d509b1a66d9e34117efdc3f7 Mon Sep 17 00:00:00 2001 From: pomponchik Date: Sun, 31 May 2026 04:06:57 +0300 Subject: [PATCH 2/6] Add the freeze flag --- dirstree/crawlers/crawler.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/dirstree/crawlers/crawler.py b/dirstree/crawlers/crawler.py index f62202a..81afcf3 100644 --- a/dirstree/crawlers/crawler.py +++ b/dirstree/crawlers/crawler.py @@ -31,7 +31,7 @@ class Crawler(AbstractCrawler): Only the first argument with the directory path is required, the rest are optional. """ - def __init__( + def __init__( # noqa: PLR0913 self, *paths: Union[str, Path], extensions: Optional[Collection[str]] = None, @@ -39,6 +39,7 @@ def __init__( filter: Optional[Callable[[Path], bool]] = None, # noqa: A002 token: AbstractToken = DefaultToken(), # noqa: B008 only_files: bool = True, + freeze: bool = False, ) -> None: if extensions is not None and not only_files: raise IncompatibleCrawlerOptionsError( @@ -61,6 +62,7 @@ def __init__( self.filter = filter self.token = token self.only_files = only_files + self.frozen = freeze self.addictional_repr_filters: Dict[str, Callable[[Any], bool]] = {} @@ -71,6 +73,7 @@ def __repr__(self) -> str: 'filter': not_none, 'token': lambda x: not isinstance(x, DefaultToken), 'only_files': lambda x: x is False, + 'freeze': lambda x: x is True, } filters.update(self.addictional_repr_filters) @@ -83,13 +86,12 @@ def __repr__(self) -> str: 'filter': self.filter, 'token': self.token, 'only_files': self.only_files, + 'freeze': self.frozen, }, filters=filters, # type: ignore[arg-type] ) - def go(self, token: AbstractToken = DefaultToken()) -> Generator[Path, None, None]: # noqa: B008 - token = token + self.token - + def _traverse(self, token: AbstractToken) -> Generator[Path, None, None]: excludes_spec = pathspec.PathSpec.from_lines('gitwildmatch', self.exclude) for path in self.paths: @@ -111,3 +113,15 @@ def go(self, token: AbstractToken = DefaultToken()) -> Generator[Path, None, Non break else: break + + def go(self, token: AbstractToken = DefaultToken()) -> Generator[Path, None, None]: # noqa: B008 + token = token + self.token + + if self.frozen: + snapshot = list(self._traverse(token)) + for path in snapshot: + if not token: + break + yield path + else: + yield from self._traverse(token) From 4342041825a30a01a5ebe736b7e58925755c0c05 Mon Sep 17 00:00:00 2001 From: pomponchik Date: Sun, 31 May 2026 04:07:17 +0300 Subject: [PATCH 3/6] Add the freeze flag to the python crawler --- dirstree/crawlers/python_crawler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dirstree/crawlers/python_crawler.py b/dirstree/crawlers/python_crawler.py index d3b895e..3d1951b 100644 --- a/dirstree/crawlers/python_crawler.py +++ b/dirstree/crawlers/python_crawler.py @@ -13,9 +13,10 @@ def __init__( exclude: Optional[List[str]] = None, filter: Optional[Callable[[Path], bool]] = None, # noqa: A002 token: AbstractToken = DefaultToken(), # noqa: B008 + freeze: bool = False, ) -> None: super().__init__( - *paths, extensions=('.py',), exclude=exclude, filter=filter, token=token, + *paths, extensions=('.py',), exclude=exclude, filter=filter, token=token, freeze=freeze, ) self.addictional_repr_filters = { 'extensions': lambda x: False, # noqa: ARG005 From d4ea0ca34bc153ed3264a12d79304478791df885 Mon Sep 17 00:00:00 2001 From: pomponchik Date: Sun, 31 May 2026 04:07:26 +0300 Subject: [PATCH 4/6] New tests --- tests/test_crawler.py | 1148 ++++++++++++++++++++++++++++++---- tests/test_python_crawler.py | 51 ++ 2 files changed, 1089 insertions(+), 110 deletions(-) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index d083687..b788cac 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -1,6 +1,7 @@ import errno import os import stat +import sys from functools import partial from inspect import Parameter, signature from pathlib import Path @@ -16,6 +17,7 @@ IncompatibleCrawlerOptionsError, PythonCrawler, ) +from dirstree.crawlers.group import CrawlersGroup INCOMPATIBLE_OPTIONS_MESSAGE = ( 'The "extensions" and "only_files=False" options are incompatible: ' @@ -28,7 +30,15 @@ def custom_filter(path: Path) -> bool: # noqa: ARG001 return True -def test_only_files_false_yields_files_and_directories(all_entities_directory_path: Union[str, Path]): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_only_files_false_yields_files_and_directories(all_entities_directory_path: Union[str, Path], freeze_kwargs): """ Crawling all entities should include both files and directories. @@ -37,7 +47,7 @@ def test_only_files_false_yields_files_and_directories(all_entities_directory_pa """ base_path = Path(all_entities_directory_path) - assert set(Crawler(all_entities_directory_path, only_files=False)) == { + assert set(Crawler(all_entities_directory_path, only_files=False, **freeze_kwargs)) == { base_path / '__init__.py', base_path / 'simple_code.py', base_path / '.hidden_file', @@ -72,7 +82,15 @@ def test_only_files_is_keyword_only(): assert signature(Crawler).parameters['only_files'].kind is Parameter.KEYWORD_ONLY -def test_only_files_false_yields_empty_directories(tmp_path: Path): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_only_files_false_yields_empty_directories(tmp_path: Path, freeze_kwargs): """ Empty directories should be yielded when all filesystem entities are crawled. @@ -82,33 +100,57 @@ def test_only_files_false_yields_empty_directories(tmp_path: Path): empty_folder = tmp_path / 'empty_folder' empty_folder.mkdir() - assert empty_folder in set(Crawler(tmp_path, only_files=False)) + assert empty_folder in set(Crawler(tmp_path, only_files=False, **freeze_kwargs)) -def test_only_files_false_yields_hidden_paths(all_entities_directory_path: Union[str, Path]): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_only_files_false_yields_hidden_paths(all_entities_directory_path: Union[str, Path], freeze_kwargs): """ Hidden paths should not be filtered out by the all-entity mode. The test uses fixture entries whose names start with a dot and verifies that both the hidden file and hidden directory are yielded. """ - paths = set(Crawler(all_entities_directory_path, only_files=False)) + paths = set(Crawler(all_entities_directory_path, only_files=False, **freeze_kwargs)) assert Path(all_entities_directory_path) / '.hidden_file' in paths assert Path(all_entities_directory_path) / '.hidden_folder' in paths -def test_only_files_false_does_not_yield_base_path(all_entities_directory_path: Union[str, Path]): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_only_files_false_does_not_yield_base_path(all_entities_directory_path: Union[str, Path], freeze_kwargs): """ Crawling all entities should not yield the base path itself. The test verifies that the root passed to the crawler is absent from the yielded paths. """ - assert Path(all_entities_directory_path) not in set(Crawler(all_entities_directory_path, only_files=False)) + assert Path(all_entities_directory_path) not in set(Crawler(all_entities_directory_path, only_files=False, **freeze_kwargs)) -def test_default_mode_stays_file_only(all_entities_directory_path: Union[str, Path]): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_default_mode_stays_file_only(all_entities_directory_path: Union[str, Path], freeze_kwargs): """ The default crawler mode should remain file-only. @@ -125,43 +167,75 @@ def test_default_mode_stays_file_only(all_entities_directory_path: Union[str, Pa base_path / 'nested_folder' / 'non_python_file.txt', base_path / 'nested_folder' / 'python_file.py', } - real_paths = set(Crawler(all_entities_directory_path)) + real_paths = set(Crawler(all_entities_directory_path, **freeze_kwargs)) assert real_paths == expected_paths assert all(path.is_file() for path in real_paths) -def test_zero_paths_with_only_files_false_returns_empty_list(): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_zero_paths_with_only_files_false_returns_empty_list(freeze_kwargs): """ A crawler without base paths should be empty in all-entity mode. The test constructs a zero-path crawler with `only_files=False` and verifies that iteration returns an empty list. """ - assert list(Crawler(only_files=False)) == [] + assert list(Crawler(only_files=False, **freeze_kwargs)) == [] -def test_empty_base_directory_with_only_files_false_returns_empty_list(tmp_path: Path): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_empty_base_directory_with_only_files_false_returns_empty_list(tmp_path: Path, freeze_kwargs): """ An empty base directory should not yield itself. The test crawls an empty temporary directory with `only_files=False` and expects no child paths. """ - assert list(Crawler(tmp_path, only_files=False)) == [] + assert list(Crawler(tmp_path, only_files=False, **freeze_kwargs)) == [] -def test_nonexistent_base_path_with_only_files_false_returns_empty_list(tmp_path: Path): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_nonexistent_base_path_with_only_files_false_returns_empty_list(tmp_path: Path, freeze_kwargs): """ A nonexistent base path should yield no results in all-entity mode. The test points the crawler at a missing path and verifies that no entries are yielded in all-entity mode. """ - assert list(Crawler(tmp_path / 'missing', only_files=False)) == [] + assert list(Crawler(tmp_path / 'missing', only_files=False, **freeze_kwargs)) == [] -def test_file_base_path_with_only_files_false_returns_empty_list(tmp_path: Path): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_file_base_path_with_only_files_false_returns_empty_list(tmp_path: Path, freeze_kwargs): """ A file used as the base path should not be yielded as its own child. @@ -171,7 +245,7 @@ def test_file_base_path_with_only_files_false_returns_empty_list(tmp_path: Path) file_path = tmp_path / 'file.py' file_path.write_text('content') - assert list(Crawler(file_path, only_files=False)) == [] + assert list(Crawler(file_path, only_files=False, **freeze_kwargs)) == [] @pytest.mark.parametrize( @@ -193,6 +267,7 @@ def test_exclude_directory_pattern_excludes_directory_and_children( remain visible. """ base_path = Path(all_entities_directory_path) + paths = set(Crawler(all_entities_directory_path, only_files=False, exclude=exclude)) assert base_path / 'nested_folder' not in paths @@ -234,6 +309,7 @@ def test_filter_false_for_directory_does_not_prune_children(all_entities_directo that directory can still be yielded. """ base_path = Path(all_entities_directory_path) + paths = set( Crawler( all_entities_directory_path, @@ -266,6 +342,7 @@ def condition() -> bool: return index == n token = ConditionToken(condition) + crawler = Crawler(all_entities_directory_path, only_files=False, token=token, filter=count) assert list(crawler) == list(Crawler(all_entities_directory_path, only_files=False))[:n] @@ -278,10 +355,11 @@ def test_multiple_base_paths_with_only_files_false_are_not_deduplicated(all_enti The test crawls the same base path twice in one crawler and compares sorted string paths with two copies of a single-base traversal. """ + expected_paths = sorted(str(path) for path in list(Crawler(all_entities_directory_path, only_files=False)) * 2) + real_paths = sorted( str(path) for path in Crawler(all_entities_directory_path, all_entities_directory_path, only_files=False) ) - expected_paths = sorted(str(path) for path in list(Crawler(all_entities_directory_path, only_files=False)) * 2) assert real_paths == expected_paths @@ -326,6 +404,10 @@ def test_group_with_only_files_false_deduplicates_paths(all_entities_directory_p assert list(group) == list(Crawler(all_entities_directory_path, only_files=False)) +@pytest.mark.skipif( + sys.platform == 'win32', + reason='Symlink creation requires administrator privileges on Windows.', +) def test_group_deduplicates_by_path_without_resolving(tmp_path: Path): """ Group deduplication should preserve distinct paths to the same target. @@ -339,11 +421,7 @@ def test_group_deduplicates_by_path_without_resolving(tmp_path: Path): link_file = link_directory / 'file.txt' real_directory.mkdir() real_file.write_text('content') - - try: - link_directory.symlink_to(real_directory, target_is_directory=True) - except (NotImplementedError, OSError) as e: - pytest.skip(f'Symlinks are not supported here: {e}') + link_directory.symlink_to(real_directory, target_is_directory=True) paths = list(Crawler(real_directory, only_files=False) + Crawler(link_directory, only_files=False)) @@ -419,6 +497,10 @@ def test_incompatible_options_error_message_mentions_both_options(all_entities_d assert 'non-file filesystem entities' in str(error.value) +@pytest.mark.skipif( + sys.platform == 'win32', + reason='Symlink creation requires administrator privileges on Windows.', +) def test_only_files_false_yields_symlink_nodes_when_supported(tmp_path: Path): """ All-entity mode should yield symlink entries under the base path. @@ -433,13 +515,9 @@ def test_only_files_false_yields_symlink_nodes_when_supported(tmp_path: Path): broken_link = tmp_path / 'broken_link' target_file.write_text('target') target_directory.mkdir() - - try: - file_link.symlink_to(target_file) - directory_link.symlink_to(target_directory, target_is_directory=True) - broken_link.symlink_to(tmp_path / 'missing') - except (NotImplementedError, OSError) as e: - pytest.skip(f'Symlinks are not supported here: {e}') + file_link.symlink_to(target_file) + directory_link.symlink_to(target_directory, target_is_directory=True) + broken_link.symlink_to(tmp_path / 'missing') paths = set(Crawler(tmp_path, only_files=False)) @@ -448,13 +526,24 @@ def test_only_files_false_yields_symlink_nodes_when_supported(tmp_path: Path): assert broken_link in paths -def test_rglob_errors_propagate_with_only_files_false(tmp_path: Path): +@pytest.mark.skipif( + sys.platform == 'win32' or sys.version_info >= (3, 13), + reason='Path.rglob does not raise PermissionError for chmod(0) directories on Windows, and on Python 3.13+ pathlib silently skips inaccessible entries.', +) +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_rglob_errors_propagate_with_only_files_false(tmp_path: Path, freeze_kwargs): """ Traversal errors from `Path.rglob` should not be swallowed. - The test creates an unreadable directory and first checks whether this - platform exposes that as a `PermissionError`. If it does, the crawler must - propagate the same error; otherwise the test is skipped. + The test creates an unreadable directory and verifies that the crawler + propagates the same `PermissionError` that `Path.rglob` would surface. """ blocked = tmp_path / 'blocked' blocked.mkdir() @@ -462,24 +551,70 @@ def test_rglob_errors_propagate_with_only_files_false(tmp_path: Path): blocked.chmod(0) try: - try: - list(tmp_path.rglob('*')) - except PermissionError: - pass - else: - pytest.skip('Path.rglob does not propagate permission errors on this platform.') - with pytest.raises( PermissionError, match=match(str(PermissionError(errno.EACCES, os.strerror(errno.EACCES), str(blocked)))), ): - list(Crawler(tmp_path, only_files=False)) + list(Crawler(tmp_path, only_files=False, **freeze_kwargs)) + finally: + blocked.chmod(stat.S_IRWXU) + + +@pytest.mark.skipif( + sys.platform == 'win32' or sys.version_info < (3, 13), + reason='pathlib silently swallows OSError during traversal only on POSIX Python 3.13+.', +) +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_unreadable_subdirectory_is_silently_skipped_on_python_3_13_plus_posix(tmp_path: Path, freeze_kwargs): + """ + On POSIX Python 3.13+, `Path.rglob` deliberately swallows `OSError` (and + its `PermissionError` subclass) for inaccessible entries to match + `glob.glob` behaviour. The crawler must transparently inherit that + contract: an unreadable subdirectory does not raise — it just contributes + nothing to the result, while the rest of the tree iterates as usual. + + Mirror of `test_rglob_errors_propagate_with_only_files_false`, which is + skipped on this same combination of platform and Python version because + the propagation invariant simply does not apply there. Together the two + tests pin down `Crawler`'s observable behaviour around `OSError` from + `rglob` across the whole CI matrix. + """ + visible = tmp_path / 'visible.txt' + visible.write_text('content') + blocked = tmp_path / 'blocked' + blocked.mkdir() + hidden = blocked / 'hidden.txt' + hidden.write_text('content') + blocked.chmod(0) + + try: + paths = set(Crawler(tmp_path, only_files=False, **freeze_kwargs)) finally: blocked.chmod(stat.S_IRWXU) + assert visible in paths + assert blocked in paths + assert hidden not in paths + +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) def test_crawl_test_directory_with_default_extensions( crawl_directory_path: Union[str, Path], + freeze_kwargs, ): """ The default crawler should return every file from the fixture tree. @@ -487,7 +622,7 @@ def test_crawl_test_directory_with_default_extensions( The test compares sorted string paths with the full expected file list, including files in the nested directory. """ - crawler = Crawler(crawl_directory_path) + crawler = Crawler(crawl_directory_path, **freeze_kwargs) expected_paths = [ os.path.join('tests', 'test_files', 'walk_it', '__init__.py'), @@ -500,6 +635,7 @@ def test_crawl_test_directory_with_default_extensions( ), os.path.join('tests', 'test_files', 'walk_it', 'nested_folder', '__init__.py'), ] + real_paths = [str(x) for x in crawler] expected_paths.sort() @@ -526,14 +662,22 @@ def test_crawl_test_directory_with_txt_extension( ] -def test_crawl_test_directory_with_py_extension(crawl_directory_path: Union[str, Path]): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_crawl_test_directory_with_py_extension(crawl_directory_path: Union[str, Path], freeze_kwargs): """ Python extension filtering should keep only `.py` files. The test compares sorted paths from `extensions=['.py']` with the expected Python files in the fixture tree. """ - crawler = Crawler(crawl_directory_path, extensions=['.py']) + crawler = Crawler(crawl_directory_path, extensions=['.py'], **freeze_kwargs) expected_paths = [ os.path.join('tests', 'test_files', 'walk_it', '__init__.py'), @@ -543,6 +687,7 @@ def test_crawl_test_directory_with_py_extension(crawl_directory_path: Union[str, ), os.path.join('tests', 'test_files', 'walk_it', 'nested_folder', '__init__.py'), ] + real_paths = [str(x) for x in crawler] expected_paths.sort() @@ -570,8 +715,17 @@ def test_crawl_test_directory_with_exclude_with_py_extension( ] +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) def test_crawl_test_directory_with_exclude_patterns_without_extensions( crawl_directory_path: Union[str, Path], + freeze_kwargs, ): """ Exclude patterns should apply when no extension filter is configured. @@ -579,7 +733,7 @@ def test_crawl_test_directory_with_exclude_patterns_without_extensions( The test excludes `__init__.py` and verifies that all other fixture files remain in the result. """ - crawler = Crawler(crawl_directory_path, exclude=['__init__.py']) + crawler = Crawler(crawl_directory_path, exclude=['__init__.py'], **freeze_kwargs) expected_paths = [ os.path.join('tests', 'test_files', 'walk_it', 'simple_code.py'), @@ -590,6 +744,7 @@ def test_crawl_test_directory_with_exclude_patterns_without_extensions( 'tests', 'test_files', 'walk_it', 'nested_folder', 'python_file.py', ), ] + real_paths = [str(x) for x in crawler] expected_paths.sort() @@ -598,8 +753,17 @@ def test_crawl_test_directory_with_exclude_patterns_without_extensions( assert real_paths == expected_paths +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) def test_crawl_test_directory_with_exclude_patterns_and_extensions( crawl_directory_path: Union[str, Path], + freeze_kwargs, ): """ Exclude patterns should compose with non-Python extension filtering. @@ -608,7 +772,7 @@ def test_crawl_test_directory_with_exclude_patterns_and_extensions( and verifies that the text file is still yielded. """ crawler = Crawler( - crawl_directory_path, extensions=['.txt'], exclude=['__init__.py'], + crawl_directory_path, extensions=['.txt'], exclude=['__init__.py'], **freeze_kwargs, ) assert [str(x) for x in crawler] == [ @@ -618,31 +782,44 @@ def test_crawl_test_directory_with_exclude_patterns_and_extensions( ] -@pytest.mark.parametrize( - ('crawler', 'expected_repr'), - [ - (Crawler('.'), "Crawler('.')"), - (Crawler('usr/bin'), "Crawler('usr/bin')"), - (Crawler('.', extensions=['.py']), "Crawler('.', extensions=['.py'])"), - (Crawler('.', exclude=['*.py'], extensions=['.py']), "Crawler('.', extensions=['.py'], exclude=['*.py'])"), - (Crawler('.', exclude=['*.py']), "Crawler('.', exclude=['*.py'])"), - (Crawler('.', filter=custom_filter), "Crawler('.', filter=custom_filter)"), - (Crawler('.', filter=lambda x: True), "Crawler('.', filter=lambda x: True)"), # noqa: ARG005 - (Crawler('.', token=ConditionToken(lambda: True)), "Crawler('.', token=ConditionToken(λ))"), - (Crawler('../dirstree') + Crawler('../cantok'), "CrawlersGroup([Crawler('../dirstree'), Crawler('../cantok')])"), - (Crawler('../dirstree') + PythonCrawler('../cantok'), "CrawlersGroup([Crawler('../dirstree'), PythonCrawler('../cantok')])"), - ], -) -def test_repr(crawler: Crawler, expected_repr: str): +def test_repr(): """ Crawler and group representations should include configured options. - The parametrized cases cover plain crawlers, filters, tokens, excludes, - extensions, and mixed crawler groups by checking exact `repr()` output. - """ - assert repr(crawler) == expected_repr + The assertions cover plain crawlers, filters, tokens, excludes, extensions, + mixed crawler groups, and every `freeze` combination (alone, with each + other option, hidden when explicitly `False`, and shown last after + `only_files`). For `freeze=True` we also verify that `PythonCrawler` keeps + its hardcoded `extensions` hidden while still exposing the new field. + """ + assert repr(Crawler('.')) == "Crawler('.')" + assert repr(Crawler('usr/bin')) == "Crawler('usr/bin')" + assert repr(Crawler('.', extensions=['.py'])) == "Crawler('.', extensions=['.py'])" + assert repr(Crawler('.', exclude=['*.py'], extensions=['.py'])) == "Crawler('.', extensions=['.py'], exclude=['*.py'])" + assert repr(Crawler('.', exclude=['*.py'])) == "Crawler('.', exclude=['*.py'])" + assert repr(Crawler('.', filter=custom_filter)) == "Crawler('.', filter=custom_filter)" + assert repr(Crawler('.', filter=lambda x: True)) == "Crawler('.', filter=lambda x: True)" # noqa: ARG005 + assert repr(Crawler('.', token=ConditionToken(lambda: True))) == "Crawler('.', token=ConditionToken(λ))" + assert repr(Crawler('../dirstree') + Crawler('../cantok')) == "CrawlersGroup([Crawler('../dirstree'), Crawler('../cantok')])" + assert repr(Crawler('../dirstree') + PythonCrawler('../cantok')) == "CrawlersGroup([Crawler('../dirstree'), PythonCrawler('../cantok')])" + + assert repr(Crawler('.', freeze=True)) == "Crawler('.', freeze=True)" + assert repr(Crawler('.', freeze=False)) == "Crawler('.')" + assert repr(Crawler('.', extensions=['.py'], freeze=True)) == "Crawler('.', extensions=['.py'], freeze=True)" + assert repr(Crawler('.', only_files=False, freeze=True)) == "Crawler('.', only_files=False, freeze=True)" + assert repr(Crawler('.', filter=custom_filter, freeze=True)) == "Crawler('.', filter=custom_filter, freeze=True)" + assert repr(Crawler('.', token=ConditionToken(lambda: True), freeze=True)) == "Crawler('.', token=ConditionToken(λ), freeze=True)" + assert repr(PythonCrawler('.', freeze=True)) == "PythonCrawler('.', freeze=True)" +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) @pytest.mark.parametrize( 'factory', [ @@ -650,18 +827,26 @@ def test_repr(crawler: Crawler, expected_repr: str): PythonCrawler, ], ) -def test_iter(factory: Type[Crawler]): +def test_iter(factory: Type[Crawler], freeze_kwargs): """ Iterating a crawler should delegate to `go()`. The test runs both crawler classes and compares `list(crawler)` with `list(crawler.go())`. """ - crawler = factory('.') + crawler = factory('.', **freeze_kwargs) assert list(crawler) == list(crawler.go()) +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) @pytest.mark.parametrize( 'factory', [ @@ -669,18 +854,26 @@ def test_iter(factory: Type[Crawler]): PythonCrawler, ], ) -def test_crawl_repeat(factory: Type[Crawler]): +def test_crawl_repeat(factory: Type[Crawler], freeze_kwargs): """ Crawlers should be reusable across repeated iterations. The test materializes the same crawler twice and verifies that both iterations produce identical results. """ - crawler = factory('.') + crawler = factory('.', **freeze_kwargs) assert list(crawler) == list(crawler) +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) @pytest.mark.parametrize( 'factory', [ @@ -688,7 +881,7 @@ def test_crawl_repeat(factory: Type[Crawler]): PythonCrawler, ], ) -def test_filter_skips_first_path(factory: Type[Crawler]): +def test_filter_skips_first_path(factory: Type[Crawler], freeze_kwargs): """ A false filter result should hide exactly the matching path. @@ -707,9 +900,17 @@ def empty_filter(path) -> bool: # noqa: ARG001 return result - assert list(factory('.'))[1:] == list(factory('.', filter=empty_filter)) + assert list(factory('.', **freeze_kwargs))[1:] == list(factory('.', filter=empty_filter, **freeze_kwargs)) +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) @pytest.mark.parametrize( 'factory', [ @@ -717,7 +918,7 @@ def empty_filter(path) -> bool: # noqa: ARG001 PythonCrawler, ], ) -def test_argument_of_filter_is_path_object(crawl_directory_path: Union[str, Path], factory: Type[Crawler]): +def test_argument_of_filter_is_path_object(crawl_directory_path: Union[str, Path], factory: Type[Crawler], freeze_kwargs): """ Filters should receive the same `Path` objects that traversal yields. @@ -730,7 +931,7 @@ def empty_filter(path): collector.append(path) return True - crawler = factory(crawl_directory_path, filter=empty_filter) + crawler = factory(crawl_directory_path, filter=empty_filter, **freeze_kwargs) assert list(crawler) == collector @@ -776,6 +977,14 @@ def condition() -> bool: assert list(factory(crawl_directory_path))[:n] == list(crawler) +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) @pytest.mark.parametrize( 'factory', [ @@ -783,16 +992,24 @@ def condition() -> bool: PythonCrawler, ], ) -def test_cancelled_token(crawl_directory_path: Union[str, Path], factory: Type[Crawler]): +def test_cancelled_token(crawl_directory_path: Union[str, Path], factory: Type[Crawler], freeze_kwargs): """ An already-cancelled token should suppress traversal. The test passes a cancelled token to both crawler classes and expects an empty result. """ - assert list(factory(crawl_directory_path, token=SimpleToken(cancelled=True))) == [] + assert list(factory(crawl_directory_path, token=SimpleToken(cancelled=True), **freeze_kwargs)) == [] +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) @pytest.mark.parametrize( 'factory', [ @@ -800,15 +1017,15 @@ def test_cancelled_token(crawl_directory_path: Union[str, Path], factory: Type[C PythonCrawler, ], ) -def test_default_token(crawl_directory_path: Union[str, Path], factory: Type[Crawler]): +def test_default_token(crawl_directory_path: Union[str, Path], factory: Type[Crawler], freeze_kwargs): """ An explicit default token should behave like the implicit default. The test compares traversal with `DefaultToken()` to normal traversal for both crawler classes. """ - assert list(factory(crawl_directory_path, token=DefaultToken())) == list( - factory(crawl_directory_path), + assert list(factory(crawl_directory_path, token=DefaultToken(), **freeze_kwargs)) == list( + factory(crawl_directory_path, **freeze_kwargs), ) @@ -902,14 +1119,22 @@ def test_addition_with_non_crawler_raises_type_error(): Crawler('.') + 'kek' -def test_crawl_two_folders(crawl_directory_path: Union[str, Path], second_crawl_directory_path: Union[str, Path]): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_crawl_two_folders(crawl_directory_path: Union[str, Path], second_crawl_directory_path: Union[str, Path], freeze_kwargs): """ A multipath crawler should traverse base paths in argument order. The test compares one crawler with two base paths to the concatenation of two single-path crawler results. """ - assert list(Crawler(crawl_directory_path, second_crawl_directory_path)) == list(Crawler(crawl_directory_path)) + list(Crawler(second_crawl_directory_path)) + assert list(Crawler(crawl_directory_path, second_crawl_directory_path, **freeze_kwargs)) == list(Crawler(crawl_directory_path, **freeze_kwargs)) + list(Crawler(second_crawl_directory_path, **freeze_kwargs)) def test_crawl_without_path(): @@ -940,7 +1165,9 @@ def test_apply_calls_function_once_per_file(crawl_directory_path: Union[str, Pat iteration length. """ seen: list = [] + Crawler(crawl_directory_path).apply(seen.append) + assert len(seen) == len(list(Crawler(crawl_directory_path))) @@ -952,7 +1179,9 @@ def test_apply_passes_path_instance(crawl_directory_path: Union[str, Path]): requires all callback arguments to be paths. """ types_seen: list = [] + Crawler(crawl_directory_path).apply(lambda p: types_seen.append(isinstance(p, Path))) + assert types_seen assert all(types_seen) @@ -965,7 +1194,9 @@ def test_apply_set_matches_iteration_set(crawl_directory_path: Union[str, Path]) iteration set. """ seen: list = [] + Crawler(crawl_directory_path).apply(seen.append) + assert set(seen) == set(Crawler(crawl_directory_path)) @@ -977,7 +1208,9 @@ def test_apply_order_matches_iteration(crawl_directory_path: Union[str, Path]): crawler iteration. """ seen: list = [] + Crawler(crawl_directory_path).apply(seen.append) + assert seen == list(Crawler(crawl_directory_path)) @@ -1001,13 +1234,23 @@ def test_apply_multiple_invocations_independent(crawl_directory_path: Union[str, crawler = Crawler(crawl_directory_path) first: list = [] second: list = [] + crawler.apply(first.append) crawler.apply(second.append) + assert first == second assert first == list(crawler) -def test_apply_on_empty_directory(tmp_path: Path): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_apply_on_empty_directory(tmp_path: Path, freeze_kwargs): """ `apply()` should not call the callback for an empty directory. @@ -1015,7 +1258,9 @@ def test_apply_on_empty_directory(tmp_path: Path): remains empty. """ seen: list = [] - Crawler(tmp_path).apply(seen.append) + + Crawler(tmp_path, **freeze_kwargs).apply(seen.append) + assert seen == [] @@ -1027,7 +1272,9 @@ def test_apply_respects_extensions(crawl_directory_path: Union[str, Path]): every visited path has a Python suffix. """ seen: list = [] + Crawler(crawl_directory_path, extensions=['.py']).apply(seen.append) + assert seen assert all(p.suffix == '.py' for p in seen) @@ -1040,7 +1287,9 @@ def test_apply_respects_exclude(crawl_directory_path: Union[str, Path]): none of the visited paths have that file name. """ seen: list = [] + Crawler(crawl_directory_path, exclude=['__init__.py']).apply(seen.append) + assert seen assert all(p.name != '__init__.py' for p in seen) @@ -1053,7 +1302,9 @@ def test_apply_respects_custom_filter(crawl_directory_path: Union[str, Path]): verifies that every visited path matches that predicate. """ seen: list = [] + Crawler(crawl_directory_path, filter=lambda x: x.suffix == '.py').apply(seen.append) + assert seen assert all(p.suffix == '.py' for p in seen) @@ -1071,7 +1322,9 @@ def test_apply_respects_all_filters_combined(crawl_directory_path: Union[str, Pa exclude=['__init__.py'], filter=lambda x: 'simple' in x.name, ) + Crawler(crawl_directory_path, **kwargs).apply(seen.append) # type: ignore[arg-type] + assert seen == list(Crawler(crawl_directory_path, **kwargs)) # type: ignore[arg-type] @@ -1083,7 +1336,9 @@ def test_apply_with_cancelled_call_time_token_skips_callback(crawl_directory_pat path reaches the callback. """ seen: list = [] + Crawler(crawl_directory_path).apply(seen.append, token=SimpleToken(cancelled=True)) + assert seen == [] @@ -1095,7 +1350,9 @@ def test_apply_with_cancelled_instance_token_skips_callback(crawl_directory_path `apply()` never calls the callback. """ seen: list = [] + Crawler(crawl_directory_path, token=SimpleToken(cancelled=True)).apply(seen.append) + assert seen == [] @@ -1108,6 +1365,7 @@ def test_apply_with_condition_token_cancels_after_n(crawl_directory_path: Union[ and the visited paths are compared with the first `n` iteration results. """ seen: list = [] + index = 0 def condition() -> bool: @@ -1119,6 +1377,7 @@ def callback(path: Path) -> None: index += 1 Crawler(crawl_directory_path).apply(callback, token=ConditionToken(condition)) + assert seen == list(Crawler(crawl_directory_path))[:n] @@ -1142,8 +1401,10 @@ def test_apply_combines_instance_and_call_tokens( already cancelled, and all of them should skip callbacks. """ seen: list = [] + crawler = Crawler(crawl_directory_path, token=SimpleToken(cancelled=instance_cancelled)) crawler.apply(seen.append, token=SimpleToken(cancelled=apply_cancelled)) + assert seen == [] @@ -1155,7 +1416,9 @@ def test_apply_default_token_walks_everything(crawl_directory_path: Union[str, P receives the same paths as normal iteration. """ seen: list = [] + Crawler(crawl_directory_path).apply(seen.append, token=DefaultToken()) + assert seen == list(Crawler(crawl_directory_path)) @@ -1167,6 +1430,7 @@ def test_apply_token_check_granularity_is_between_yields(crawl_directory_path: U that flag before the next callback, so exactly one path is visited. """ seen: list = [] + cancelled_flag = False def condition() -> bool: @@ -1178,10 +1442,19 @@ def callback(path: Path) -> None: cancelled_flag = True Crawler(crawl_directory_path).apply(callback, token=ConditionToken(condition)) + assert len(seen) == 1 -def test_apply_with_zero_arg_callable_raises(crawl_directory_path: Union[str, Path]): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_apply_with_zero_arg_callable_raises(crawl_directory_path: Union[str, Path], freeze_kwargs): """ `apply()` should reject callbacks without a path argument. @@ -1196,12 +1469,20 @@ def collect_filter(path: Path) -> bool: return True with pytest.raises(SignatureMismatchError, match=match('The signature of the callable object does not match the expected one.')): - Crawler(crawl_directory_path, filter=collect_filter).apply(lambda: None) # type: ignore[misc, arg-type] + Crawler(crawl_directory_path, filter=collect_filter, **freeze_kwargs).apply(lambda: None) # type: ignore[misc, arg-type] assert filter_calls == [] -def test_apply_with_two_arg_callable_raises(crawl_directory_path: Union[str, Path]): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_apply_with_two_arg_callable_raises(crawl_directory_path: Union[str, Path], freeze_kwargs): """ `apply()` should reject callbacks requiring too many positional arguments. @@ -1214,7 +1495,7 @@ def test_apply_with_two_arg_callable_raises(crawl_directory_path: Union[str, Pat 'This is a difficult situation, there is no guarantee that a call with a variable number of positional arguments will fill all the slots of positional arguments.', ), ): - Crawler(crawl_directory_path).apply(lambda x, y: None) # type: ignore[misc, arg-type] # noqa: ARG005 + Crawler(crawl_directory_path, **freeze_kwargs).apply(lambda x, y: None) # type: ignore[misc, arg-type] # noqa: ARG005 def test_apply_with_def_function_works(crawl_directory_path: Union[str, Path]): @@ -1230,6 +1511,7 @@ def callback(path: Path) -> None: seen.append(path) Crawler(crawl_directory_path).apply(callback) + assert seen == list(Crawler(crawl_directory_path)) @@ -1241,6 +1523,7 @@ def test_apply_validation_runs_at_apply_not_construction(crawl_directory_path: U to `apply()` and expects the signature error there. """ crawler = Crawler(crawl_directory_path) + with pytest.raises(SignatureMismatchError, match=match('The signature of the callable object does not match the expected one.')): crawler.apply(lambda: None) # type: ignore[misc, arg-type] @@ -1260,7 +1543,9 @@ def __call__(self, path: Path) -> None: self.seen.append(path) recorder = Recorder() + Crawler(crawl_directory_path).apply(recorder) + assert recorder.seen == list(Crawler(crawl_directory_path)) @@ -1277,7 +1562,9 @@ def cb(prefix: str, path: Path) -> None: seen.append((prefix, path)) Crawler(crawl_directory_path).apply(partial(cb, 'hit')) + expected = list(Crawler(crawl_directory_path)) + assert seen == [('hit', p) for p in expected] @@ -1296,7 +1583,9 @@ def cb(self, path: Path) -> None: self.seen.append(path) c = Collector() + Crawler(crawl_directory_path).apply(c.cb) + assert c.seen == list(Crawler(crawl_directory_path)) @@ -1315,6 +1604,7 @@ def gen(path: Path): yield path Crawler(crawl_directory_path).apply(gen) + assert counter == [] @@ -1332,7 +1622,15 @@ def boom(path: Path) -> None: # noqa: ARG001 Crawler(crawl_directory_path).apply(boom) -def test_apply_stops_iteration_on_first_exception(crawl_directory_path: Union[str, Path]): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_apply_stops_iteration_on_first_exception(crawl_directory_path: Union[str, Path], freeze_kwargs): """ `apply()` should stop immediately when the callback raises. @@ -1348,7 +1646,8 @@ def callback(path: Path) -> None: # noqa: ARG001 raise ValueError('stop here') with pytest.raises(ValueError, match=match('stop here')): - Crawler(crawl_directory_path).apply(callback) + Crawler(crawl_directory_path, **freeze_kwargs).apply(callback) + assert counter == 3 @@ -1380,8 +1679,10 @@ def test_apply_on_group_visits_paths_from_both( compares the visited set with the union of both child traversals. """ seen: list = [] + group = Crawler(crawl_directory_path) + Crawler(second_crawl_directory_path) group.apply(seen.append) + assert set(seen) == set(Crawler(crawl_directory_path)) | set(Crawler(second_crawl_directory_path)) @@ -1393,8 +1694,10 @@ def test_apply_on_group_deduplicates(crawl_directory_path: Union[str, Path]): each yielded path once, matching a single crawler traversal. """ seen: list = [] + group = Crawler(crawl_directory_path) + Crawler(crawl_directory_path) group.apply(seen.append) + assert seen == list(Crawler(crawl_directory_path)) @@ -1406,8 +1709,10 @@ def test_apply_on_nested_group_deduplicates(crawl_directory_path: Union[str, Pat callback still sees only the single-crawler traversal. """ seen: list = [] + group = Crawler(crawl_directory_path) + (Crawler(crawl_directory_path) + Crawler(crawl_directory_path)) group.apply(seen.append) + assert seen == list(Crawler(crawl_directory_path)) @@ -1422,8 +1727,10 @@ def test_apply_on_group_with_cancelled_token( child crawler invokes the callback. """ seen: list = [] + group = Crawler(crawl_directory_path) + Crawler(second_crawl_directory_path) group.apply(seen.append, token=SimpleToken(cancelled=True)) + assert seen == [] @@ -1438,9 +1745,11 @@ def test_apply_on_group_respects_child_tokens( verifies that only the live crawler contributes callback inputs. """ seen: list = [] + live = Crawler(crawl_directory_path) dead = Crawler(second_crawl_directory_path, token=SimpleToken(cancelled=True)) (live + dead).apply(seen.append) + assert set(seen) == set(live) @@ -1452,10 +1761,13 @@ def test_apply_with_multipath_crawler_no_dedup(crawl_directory_path: Union[str, stringified paths so the assertion focuses on duplicate membership. """ seen: list = [] + Crawler(crawl_directory_path, crawl_directory_path).apply(seen.append) + expected = list(Crawler(crawl_directory_path)) * 2 seen_sorted = sorted(str(p) for p in seen) expected_sorted = sorted(str(p) for p in expected) + assert seen_sorted == expected_sorted @@ -1470,7 +1782,15 @@ def test_apply_with_none_raises_valueerror(crawl_directory_path: Union[str, Path Crawler(crawl_directory_path).apply(None) # type: ignore[arg-type] -def test_apply_on_zero_path_crawler_never_calls_callback(): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_apply_on_zero_path_crawler_never_calls_callback(freeze_kwargs): """ `apply()` on a zero-path crawler should be a no-op. @@ -1478,11 +1798,21 @@ def test_apply_on_zero_path_crawler_never_calls_callback(): callback input is recorded. """ seen: list = [] - Crawler().apply(seen.append) + + Crawler(**freeze_kwargs).apply(seen.append) + assert seen == [] -def test_apply_on_nonexistent_base_path_matches_iteration_behavior(tmp_path: Path): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_apply_on_nonexistent_base_path_matches_iteration_behavior(tmp_path: Path, freeze_kwargs): """ `apply()` should match iteration behavior for nonexistent base paths. @@ -1491,34 +1821,32 @@ def test_apply_on_nonexistent_base_path_matches_iteration_behavior(tmp_path: Pat """ nonexistent = tmp_path / 'does_not_exist' - iter_paths = list(Crawler(nonexistent)) + iter_paths = list(Crawler(nonexistent, **freeze_kwargs)) seen: list = [] - Crawler(nonexistent).apply(seen.append) + + Crawler(nonexistent, **freeze_kwargs).apply(seen.append) + assert seen == iter_paths +@pytest.mark.skipif( + sys.platform == 'win32' or sys.version_info >= (3, 13), + reason='Path.rglob does not raise PermissionError for chmod(0) directories on Windows, and on Python 3.13+ pathlib silently skips inaccessible entries.', +) def test_apply_propagates_rglob_errors_with_only_files_false(tmp_path: Path): """ `apply()` should propagate traversal errors from all-entity crawling. - The test creates an unreadable directory and first checks whether this - platform exposes that as a `PermissionError`. If it does, applying a - callback through the crawler must propagate the same error. + The test creates an unreadable directory and verifies that `apply()` via the + crawler surfaces the same `PermissionError` that direct `rglob` would raise. """ blocked = tmp_path / 'blocked' blocked.mkdir() (blocked / 'file.txt').write_text('content') blocked.chmod(0) + seen: list = [] try: - try: - list(tmp_path.rglob('*')) - except PermissionError: - pass - else: - pytest.skip('Path.rglob does not propagate permission errors on this platform.') - - seen: list = [] with pytest.raises( PermissionError, match=match(str(PermissionError(errno.EACCES, os.strerror(errno.EACCES), str(blocked)))), @@ -1528,7 +1856,15 @@ def test_apply_propagates_rglob_errors_with_only_files_false(tmp_path: Path): blocked.chmod(stat.S_IRWXU) -def test_apply_on_file_base_path_matches_iteration_behavior(tmp_path: Path): +@pytest.mark.parametrize( + 'freeze_kwargs', + [ + {}, + {'freeze': False}, + {'freeze': True}, + ], +) +def test_apply_on_file_base_path_matches_iteration_behavior(tmp_path: Path, freeze_kwargs): """ `apply()` should match iteration when the base path is a file. @@ -1538,7 +1874,599 @@ def test_apply_on_file_base_path_matches_iteration_behavior(tmp_path: Path): file_path = tmp_path / 'a_file.txt' file_path.write_text('hi') - iter_paths = list(Crawler(file_path)) + iter_paths = list(Crawler(file_path, **freeze_kwargs)) seen: list = [] - Crawler(file_path).apply(seen.append) + + Crawler(file_path, **freeze_kwargs).apply(seen.append) + assert seen == iter_paths + + +@pytest.mark.parametrize( + 'factory', + [ + Crawler, + PythonCrawler, + ], +) +def test_freeze_default_is_false(factory: Type[Crawler]): + """ + The new `freeze` option should default to `False` for every crawler class. + + The test guards backward compatibility: code that does not mention `freeze` + must observe `crawler.frozen is False`. The internal attribute is named + `frozen` (state form) while the external parameter is the verb `freeze`. + """ + assert factory('.').frozen is False + + +@pytest.mark.parametrize( + 'factory', + [ + Crawler, + PythonCrawler, + ], +) +def test_freeze_is_keyword_only(factory: Type[Crawler]): + """ + The new `freeze` option should refuse positional arguments. + + The test inspects the public constructor signature and verifies that the + parameter is keyword-only for both crawler classes. + """ + assert signature(factory).parameters['freeze'].kind is Parameter.KEYWORD_ONLY + + +@pytest.mark.parametrize( + 'factory', + [ + Crawler, + PythonCrawler, + ], +) +@pytest.mark.parametrize( + 'value', + [ + True, + False, + ], +) +def test_freeze_value_is_stored_on_instance(factory: Type[Crawler], value: bool): + """ + An explicit `freeze=value` should be reflected by the `.frozen` attribute. + + The test runs over both crawler classes and both boolean values. It guards + against bugs like `self.frozen = freeze or False` (which would lose `True` + silently for the wrong branch) and against PythonCrawler regressions in + `super().__init__` plumbing. It also locks down the naming convention + (external `freeze` keyword → internal `frozen` attribute). + """ + assert factory('.', freeze=value).frozen is value + + +def test_freeze_filter_called_for_all_paths_during_snapshot_construction_before_first_yield(tmp_path: Path): + """ + With `freeze=True`, the user filter is invoked for every candidate path + during snapshot construction, before the iterator yields anything. + + This is the primary observable evidence of "snapshot is built before + iteration begins". A tracking filter records every path it sees; after the + very first `next()` on the iterator, the recorded set already equals the + full set of files in the directory, even though only one yield has + happened. This precludes the lazy interpretation where the filter would be + called incrementally with each yield. + """ + files = sorted(tmp_path / f'f{i}.txt' for i in range(5)) + for path in files: + path.touch() + + seen: List[Path] = [] + + def tracking(path: Path) -> bool: + seen.append(path) + return True + + iterator = iter(Crawler(tmp_path, freeze=True, filter=tracking)) + first = next(iterator) + + assert set(seen) == set(files) + assert first in set(files) + + +def test_freeze_filter_not_called_again_during_remaining_iteration(tmp_path: Path): + """ + Once the snapshot has been materialised during the first `next()`, the + user filter is not invoked again while the remaining snapshot is yielded. + + The test complements C1: the snapshot is built exactly once and reused for + the rest of the iteration. We freeze the filter-call count immediately + after the first `next()`, drain the iterator, and verify the count has not + increased. + """ + files = [tmp_path / f'f{i}.txt' for i in range(5)] + for path in files: + path.touch() + + seen: List[Path] = [] + + def tracking(path: Path) -> bool: + seen.append(path) + return True + + iterator = iter(Crawler(tmp_path, freeze=True, filter=tracking)) + next(iterator) + count_after_first = len(seen) + + list(iterator) + + assert len(seen) == count_after_first + + +def test_without_freeze_filter_is_called_lazily(tmp_path: Path): + """ + Without `freeze=True`, the filter — which always returns `True` — is + invoked lazily so that after the first `next()` only a strict subset of + paths has been seen. + + The contrast with C1 shows that the timing difference is exactly what + `freeze=True` changes. The filter is intentionally always-`True` so that + `len(seen) < N` cannot accidentally hold for some other reason (e.g. + selective filtering). The strict bound `0 < len(seen) < N` proves the + iteration is incremental. + """ + files = [tmp_path / f'f{i}.txt' for i in range(5)] + for path in files: + path.touch() + + seen: List[Path] = [] + + def tracking(path: Path) -> bool: + seen.append(path) + return True + + iterator = iter(Crawler(tmp_path, filter=tracking)) + next(iterator) + + assert 0 < len(seen) < len(files) + + +def test_freeze_apply_processes_all_files_even_when_callback_deletes_them(tmp_path: Path): + """ + `apply()` with `freeze=True` should call the callback for every snapshot + path, even when the callback deletes files mid-iteration. + + This is the primary user-facing scenario from the spec: walk a directory + and remove each file. Without `freeze` the result is filesystem-dependent + because deletion races with `rglob`. With `freeze` the snapshot is built + up front, so the callback runs for every captured path regardless of how + it mutates the directory. + """ + files = [tmp_path / f'f{i}.txt' for i in range(7)] + for path in files: + path.touch() + + processed: List[Path] = [] + + def delete_callback(path: Path) -> None: + processed.append(path) + path.unlink() + + Crawler(tmp_path, freeze=True).apply(delete_callback) + + assert set(processed) == set(files) + for path in files: + assert not path.exists() + + +def test_freeze_does_not_yield_files_created_after_snapshot(tmp_path: Path): + """ + A file created between snapshot construction and the end of iteration + should not be yielded. + + The complement of C4 — the snapshot does not pick up new files that arrive + after construction. The test creates two files, starts iteration to force + snapshot materialisation, then creates a third file before draining the + rest; the yielded set is exactly the two original files. + """ + file1 = tmp_path / 'a.txt' + file2 = tmp_path / 'b.txt' + file1.touch() + file2.touch() + + iterator = iter(Crawler(tmp_path, freeze=True)) + first = next(iterator) + + late = tmp_path / 'late.txt' + late.touch() + + rest = list(iterator) + + yielded = {first, *rest} + + assert yielded == {file1, file2} + assert late not in yielded + + +@pytest.mark.parametrize( + 'mutate', + [ + lambda p: p.rename(p.parent / 'renamed.txt'), + lambda p: p.unlink(), + ], + ids=['rename', 'delete'], +) +def test_freeze_yields_snapshot_path_after_rename_or_delete(tmp_path: Path, mutate): + """ + Paths captured into the snapshot are yielded as-is regardless of whether + the underlying file is renamed or deleted after the snapshot has been built. + + The two `mutate` variants exercise inode-rename (reuse under a different + name) and outright deletion. In both cases the snapshot still yields the + original `Path` object — proving that the snapshot stores plain `Path` + values, not live filesystem references. + + The test creates exactly two files and consumes the first one to force + snapshot construction. It then asserts that exactly one file remains in + the snapshot (defensively rejecting the case where snapshot collapsed to a + single entry), mutates the remaining file, and confirms that the second + `next()` yields its original (pre-mutation) path. + """ + file1 = tmp_path / 'a.txt' + file2 = tmp_path / 'b.txt' + file1.touch() + file2.touch() + + iterator = iter(Crawler(tmp_path, freeze=True)) + first = next(iterator) + + expected_other_set = {file1, file2} - {first} + assert len(expected_other_set) == 1 + other = next(iter(expected_other_set)) + + mutate(other) + + assert next(iterator) == other + + with pytest.raises(StopIteration): + next(iterator) + + +@pytest.mark.skipif( + sys.platform == 'win32', + reason='file→directory replacement at the same path while a Path object is alive is unreliable on Windows.', +) +def test_freeze_yields_snapshot_path_after_replace_with_directory(tmp_path: Path): + """ + A path captured as a file in the snapshot must still be yielded after the + file has been unlinked and replaced by a directory of the same name. + + This is a third family of post-snapshot mutation — node-type change — that + complements rename and delete. The snapshot stores plain `Path` values, so + the on-disk identity of the path is irrelevant to what the iterator emits. + """ + file1 = tmp_path / 'a.txt' + file2 = tmp_path / 'b.txt' + file1.touch() + file2.touch() + + iterator = iter(Crawler(tmp_path, freeze=True)) + first = next(iterator) + + expected_other_set = {file1, file2} - {first} + assert len(expected_other_set) == 1 + other = next(iter(expected_other_set)) + + other.unlink() + other.mkdir() + + assert next(iterator) == other + + with pytest.raises(StopIteration): + next(iterator) + + +def test_freeze_token_cancelling_exactly_after_snapshot_built_yields_nothing(tmp_path: Path): + """ + A token that becomes cancelled exactly when snapshot construction + completes — before any path is yielded — should produce zero items, yet + the snapshot itself must have been built completely. + + The test closes the race window between the end of + `list(self._traverse(token))` and the first iteration of + `for path in snapshot:`. We use a counting filter and a ConditionToken + that flips to cancelled once the filter has been called for every file + (i.e. once the snapshot is about to close). The token-check in `go()` + before the first yield sees the cancellation and skips the entire yield + loop, while `len(seen) == N` confirms the snapshot was fully constructed + first. + """ + files = [tmp_path / f'f{i}.txt' for i in range(3)] + for path in files: + path.touch() + + seen: List[Path] = [] + + def tracking(path: Path) -> bool: + seen.append(path) + return True + + token = ConditionToken(lambda: len(seen) >= len(files)) + crawler = Crawler(tmp_path, freeze=True, filter=tracking, token=token) + + assert list(crawler) == [] + assert len(seen) == len(files) + + +@pytest.mark.parametrize( + 'token_route', + [ + 'instance', + 'call', + ], +) +def test_freeze_with_already_cancelled_token_yields_nothing_and_skips_filter(tmp_path: Path, token_route: str): + """ + An already-cancelled token should suppress both snapshot construction and + yields, regardless of whether the token is passed to the constructor or + to `go(token=...)`. + + `go()` begins with `token = token + self.token`, so the two routes are + instrumentally equivalent. The test parametrises over both routes and + verifies that no path is yielded and that the user filter is never + invoked (the snapshot construction does not enter `rglob` when the + combined token is already cancelled). + """ + for index in range(3): + (tmp_path / f'f{index}.txt').touch() + + seen: List[Path] = [] + + def tracking(path: Path) -> bool: + seen.append(path) + return True + + cancelled = SimpleToken(cancelled=True) + if token_route == 'instance': + crawler = Crawler(tmp_path, freeze=True, token=cancelled, filter=tracking) + result = list(crawler) + else: + crawler = Crawler(tmp_path, freeze=True, filter=tracking) + result = list(crawler.go(cancelled)) + + assert result == [] + assert seen == [] + + +@pytest.mark.parametrize( + 'partial_snapshot_size', + [ + 0, + 1, + 2, + 3, + ], +) +def test_freeze_with_condition_token_cancelling_mid_snapshot_yields_nothing_but_partial_snapshot_built(tmp_path: Path, partial_snapshot_size: int): + """ + A `ConditionToken` that cancels after the configured number of filter + calls should truncate the snapshot at that many entries during + construction, yet the snapshot-yield loop in `go()` yields zero items. + + Important semantic note: cantok tokens are terminal — once cancelled they + remain cancelled. So although `_traverse` stops appending after the + configured number of filter calls (snapshot has that length), the + freeze-branch `for path in snapshot: if not token: break` sees the + still-cancelled token and exits immediately. The lazy-mode test + `test_cancel_after_n_iterations` deliberately yields the first + `partial_snapshot_size` entries, which is not achievable under + `freeze=True` with the same token construction. The + truncation-during-build is instead asserted indirectly via + `len(seen) == partial_snapshot_size`. + """ + files = [tmp_path / f'f{index}.txt' for index in range(5)] + for path in files: + path.touch() + + seen: List[Path] = [] + + def tracking(path: Path) -> bool: + seen.append(path) + return True + + token = ConditionToken(lambda: len(seen) >= partial_snapshot_size) + crawler = Crawler(tmp_path, freeze=True, filter=tracking, token=token) + + assert list(crawler) == [] + assert len(seen) == partial_snapshot_size + + +@pytest.mark.parametrize( + 'factory', + [ + Crawler, + PythonCrawler, + ], +) +def test_freeze_apply_visits_every_snapshot_path(crawl_directory_path: Union[str, Path], factory: Type[Crawler]): + """ + `apply()` under `freeze=True` should call the callback exactly once for + every path the crawler would yield. + + The test confirms that `apply()` uses the same yield path as `go()` and + that this still holds in freeze mode. The callback simply records each + visited path; the result is compared with direct iteration over the same + frozen crawler. + """ + seen: List[Path] = [] + + factory(crawl_directory_path, freeze=True).apply(seen.append) + + assert seen == list(factory(crawl_directory_path, freeze=True)) + + +def test_freeze_apply_does_not_visit_files_created_by_callback(tmp_path: Path): + """ + Files newly created by the callback should not be visited within the same + `apply()` call. + + The test creates three files and an `apply()` callback that creates a new + `*_clone.txt` for every visited path. After `apply()` returns, the + recorded set is exactly the three original files; the clones exist on + disk but were not visited. + """ + originals = [tmp_path / f'f{i}.txt' for i in range(3)] + for path in originals: + path.touch() + + processed: List[Path] = [] + + def cloning_callback(path: Path) -> None: + processed.append(path) + (tmp_path / f'{path.stem}_clone.txt').touch() + + Crawler(tmp_path, freeze=True).apply(cloning_callback) + + assert set(processed) == set(originals) + for path in originals: + clone = tmp_path / f'{path.stem}_clone.txt' + assert clone.exists() + assert clone not in processed + + +def test_freeze_apply_called_twice_rebuilds_snapshot_each_time(tmp_path: Path): + """ + Two sequential `apply()` calls on the same frozen crawler should each + rebuild the snapshot, so a mutation done by the first call is reflected + by the second call. + + The test creates three files and uses one frozen crawler instance. The + first `apply()` deletes every visited file; the second `apply()` collects + visited paths into a list. Because nothing remains on disk between the + calls, the second call must visit nothing — which is only possible if the + snapshot is rebuilt freshly on every `apply()` invocation. + """ + files = [tmp_path / f'f{i}.txt' for i in range(3)] + for path in files: + path.touch() + + crawler = Crawler(tmp_path, freeze=True) + + def delete_callback(path: Path) -> None: + path.unlink() + + crawler.apply(delete_callback) + + assert all(not path.exists() for path in files) + + seen: List[Path] = [] + + crawler.apply(seen.append) + + assert seen == [] + + +@pytest.mark.parametrize( + 'token_route', + [ + 'instance', + 'call', + ], +) +def test_freeze_apply_with_cancelled_token_does_not_call_callback(crawl_directory_path: Union[str, Path], token_route: str): + """ + `apply()` under `freeze=True` with an already-cancelled token should not + invoke the callback — whether the token is set on the instance or passed + via `apply(token=...)`. + + The test parametrises both token-passing routes through + `AbstractCrawler.apply → self.go(token)` and verifies that the callback + is never called in either case. + """ + seen: List[Path] = [] + + cancelled = SimpleToken(cancelled=True) + + if token_route == 'instance': + Crawler(crawl_directory_path, freeze=True, token=cancelled).apply(seen.append) + else: + Crawler(crawl_directory_path, freeze=True).apply(seen.append, token=cancelled) + + assert seen == [] + + +@pytest.mark.parametrize( + 'mix', + [ + lambda p1, p2: Crawler(p1, freeze=True) + Crawler(p2, freeze=True), + lambda p1, p2: Crawler(p1, freeze=True) + Crawler(p2), + ], + ids=['both_frozen', 'one_frozen'], +) +def test_group_with_freeze_in_children_matches_unfrozen_group( + mix, + crawl_directory_path: Union[str, Path], + second_crawl_directory_path: Union[str, Path], +): + """ + A `CrawlersGroup` whose children are frozen (in part or in full) should + yield the same deduplicated union as a fully-unfrozen group on an + unchanging filesystem. + + The test parametrises two configurations — "both children frozen" and + "only the first frozen" — and verifies that both produce the same set of + paths as the group of unfrozen children. + """ + expected = set(Crawler(crawl_directory_path) + Crawler(second_crawl_directory_path)) + + assert set(mix(crawl_directory_path, second_crawl_directory_path)) == expected + + +def test_group_with_frozen_child_apply_with_deletion(tmp_path: Path): + """ + A `CrawlersGroup` whose children are frozen `Crawler` instances should + let `apply()` delete every visited file safely. + + The test creates two subdirectories with files, builds a group of two + frozen crawlers (one per subdirectory), and runs `apply()` with an + unlink-and-record callback. Every original file is recorded as visited + and removed from disk. + """ + sub_a = tmp_path / 'a' + sub_b = tmp_path / 'b' + sub_a.mkdir() + sub_b.mkdir() + files_a = [sub_a / f'a{i}.txt' for i in range(3)] + files_b = [sub_b / f'b{i}.txt' for i in range(3)] + for path in (*files_a, *files_b): + path.touch() + + processed: List[Path] = [] + + def callback(path: Path) -> None: + processed.append(path) + path.unlink() + + group = Crawler(sub_a, freeze=True) + Crawler(sub_b, freeze=True) + group.apply(callback) + + assert set(processed) == set(files_a) | set(files_b) + for path in (*files_a, *files_b): + assert not path.exists() + + +def test_crawlers_group_rejects_freeze_keyword(crawl_directory_path: Union[str, Path]): + """ + `CrawlersGroup.__init__` should reject a `freeze` keyword argument. + + The test guards the API contract spelled out in the plan: `freeze` is a + per-child `Crawler` option, never a group-level one. If a user tries to + pass `freeze=True` to a `CrawlersGroup` by analogy with `Crawler`, the + standard `TypeError` from Python's constructor should surface. + """ + if sys.version_info < (3, 10): + expected_message = "__init__() got an unexpected keyword argument 'freeze'" + else: + expected_message = "CrawlersGroup.__init__() got an unexpected keyword argument 'freeze'" + + with pytest.raises(TypeError, match=match(expected_message)): + CrawlersGroup([Crawler(crawl_directory_path)], freeze=True) # type: ignore[call-arg] diff --git a/tests/test_python_crawler.py b/tests/test_python_crawler.py index 317e00a..463136b 100644 --- a/tests/test_python_crawler.py +++ b/tests/test_python_crawler.py @@ -201,7 +201,9 @@ def test_python_apply_only_visits_py_files(crawl_directory_path: Union[str, Path compares the callback order with normal PythonCrawler iteration. """ seen: list = [] + PythonCrawler(crawl_directory_path).apply(seen.append) + assert seen assert all(p.suffix == '.py' for p in seen) assert seen == list(PythonCrawler(crawl_directory_path)) @@ -215,7 +217,9 @@ def test_python_apply_respects_exclude(crawl_directory_path: Union[str, Path]): with normal iteration using the same exclude option. """ seen: list = [] + PythonCrawler(crawl_directory_path, exclude=['__init__.py']).apply(seen.append) + assert seen == list(PythonCrawler(crawl_directory_path, exclude=['__init__.py'])) @@ -227,7 +231,9 @@ def test_python_apply_respects_custom_filter(crawl_directory_path: Union[str, Pa every callback input matches that predicate. """ seen: list = [] + PythonCrawler(crawl_directory_path, filter=lambda x: 'simple' in x.name).apply(seen.append) + assert seen assert all('simple' in p.name for p in seen) @@ -240,5 +246,50 @@ def test_python_apply_with_cancelled_token(crawl_directory_path: Union[str, Path not invoke the callback. """ seen: list = [] + PythonCrawler(crawl_directory_path, token=SimpleToken(cancelled=True)).apply(seen.append) + assert seen == [] + + +def test_python_crawler_freeze_yields_only_python_files(crawl_directory_path: Union[str, Path]): + """ + `PythonCrawler(freeze=True)` should yield exactly the same `.py` files as + the non-frozen `PythonCrawler` on an unchanging filesystem. + + The test confirms that plumbing `freeze` through `super().__init__` does + not interfere with the hardcoded `extensions=('.py',)` filter that + `PythonCrawler` applies on every yield. + """ + assert set(PythonCrawler(crawl_directory_path, freeze=True)) == set(PythonCrawler(crawl_directory_path)) + + +def test_python_crawler_freeze_apply_handles_deletion(tmp_path: Path): + """ + `PythonCrawler` with `freeze=True` should let `apply()` delete every + visited `.py` file safely while leaving non-Python files untouched. + + The test creates a mix of `.py` and `.txt` files in `tmp_path`, runs + `PythonCrawler(tmp_path, freeze=True).apply(...)` with an unlink-and- + record callback, and verifies that exactly the original `.py` files are + recorded and removed from disk while `.txt` files are still on disk and + never received the callback. + """ + py_files = [tmp_path / f'p{i}.py' for i in range(3)] + txt_files = [tmp_path / f't{i}.txt' for i in range(2)] + for path in (*py_files, *txt_files): + path.touch() + + processed: list = [] + + def callback(path: Path) -> None: + processed.append(path) + path.unlink() + + PythonCrawler(tmp_path, freeze=True).apply(callback) + + assert set(processed) == set(py_files) + for path in py_files: + assert not path.exists() + for path in txt_files: + assert path.exists() From dc49e023a277515131cc5801edd1017d540bc029 Mon Sep 17 00:00:00 2001 From: pomponchik Date: Sun, 31 May 2026 04:08:12 +0300 Subject: [PATCH 5/6] Add the "Transactionality" block in the readme and describe the freeze flag there --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 159a78f..46286f5 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ There are many libraries for traversing directories. You can also do this using - [**Filtering**](#filtering) - [**Working with Cancellation Tokens**](#working-with-cancellation-tokens) - [**Combination**](#combination) +- [**Transactionality**](#transactionality) ## Installation @@ -178,3 +179,16 @@ for path in Crawler('../dirstree', '../cantok'): ``` > ↑ In this case, there is no deduplication of paths. + + +## Transactionality + +If you plan to modify the directory while iterating over it — for example, deleting or moving files inside an `apply()` callback — pass `freeze=True` to take a snapshot of every matching path up front, then iterate that snapshot instead of the live filesystem: + +```python +Crawler('path/to/directory', freeze=True).apply(lambda p: p.unlink()) +``` + +> ↑ The snapshot is built on the first step of iteration, with every filter and cancellation token already applied. After that, any creation, renaming or deletion happening in the directory does not affect what is yielded — each call to `go()` or `iter()` produces its own fresh snapshot. + +> ↑ Without `freeze=True` the order of yielded paths depends on the live state of the filesystem, so mid-iteration mutation may silently skip or duplicate entries. From 0bed865266324239a472ef1d7d9bc94cc1db77b0 Mon Sep 17 00:00:00 2001 From: pomponchik Date: Sun, 31 May 2026 04:19:11 +0300 Subject: [PATCH 6/6] Fix some tests --- tests/test_crawler.py | 120 +++++++++-------------------------- tests/test_python_crawler.py | 9 ++- 2 files changed, 37 insertions(+), 92 deletions(-) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index b788cac..812bbd5 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -1,4 +1,3 @@ -import errno import os import stat import sys @@ -527,42 +526,8 @@ def test_only_files_false_yields_symlink_nodes_when_supported(tmp_path: Path): @pytest.mark.skipif( - sys.platform == 'win32' or sys.version_info >= (3, 13), - reason='Path.rglob does not raise PermissionError for chmod(0) directories on Windows, and on Python 3.13+ pathlib silently skips inaccessible entries.', -) -@pytest.mark.parametrize( - 'freeze_kwargs', - [ - {}, - {'freeze': False}, - {'freeze': True}, - ], -) -def test_rglob_errors_propagate_with_only_files_false(tmp_path: Path, freeze_kwargs): - """ - Traversal errors from `Path.rglob` should not be swallowed. - - The test creates an unreadable directory and verifies that the crawler - propagates the same `PermissionError` that `Path.rglob` would surface. - """ - blocked = tmp_path / 'blocked' - blocked.mkdir() - (blocked / 'file.txt').write_text('content') - blocked.chmod(0) - - try: - with pytest.raises( - PermissionError, - match=match(str(PermissionError(errno.EACCES, os.strerror(errno.EACCES), str(blocked)))), - ): - list(Crawler(tmp_path, only_files=False, **freeze_kwargs)) - finally: - blocked.chmod(stat.S_IRWXU) - - -@pytest.mark.skipif( - sys.platform == 'win32' or sys.version_info < (3, 13), - reason='pathlib silently swallows OSError during traversal only on POSIX Python 3.13+.', + sys.platform == 'win32', + reason='Windows ignores chmod(0) on directories owned by the current user, so PermissionError is not produced.', ) @pytest.mark.parametrize( 'freeze_kwargs', @@ -572,19 +537,20 @@ def test_rglob_errors_propagate_with_only_files_false(tmp_path: Path, freeze_kwa {'freeze': True}, ], ) -def test_unreadable_subdirectory_is_silently_skipped_on_python_3_13_plus_posix(tmp_path: Path, freeze_kwargs): +def test_unreadable_subdirectory_is_silently_skipped_on_posix(tmp_path: Path, freeze_kwargs): """ - On POSIX Python 3.13+, `Path.rglob` deliberately swallows `OSError` (and - its `PermissionError` subclass) for inaccessible entries to match - `glob.glob` behaviour. The crawler must transparently inherit that - contract: an unreadable subdirectory does not raise — it just contributes - nothing to the result, while the rest of the tree iterates as usual. + On every Python version in the supported matrix, `pathlib.Path.rglob` + catches `PermissionError` from `os.scandir` internally and just stops + descending into the unreadable subtree — it does not raise. The crawler + must transparently inherit that contract: an unreadable subdirectory does + not break iteration, it simply contributes nothing to the result, while + the rest of the tree iterates as usual. - Mirror of `test_rglob_errors_propagate_with_only_files_false`, which is - skipped on this same combination of platform and Python version because - the propagation invariant simply does not apply there. Together the two - tests pin down `Crawler`'s observable behaviour around `OSError` from - `rglob` across the whole CI matrix. + The test pins down `Crawler`'s observable behaviour around `OSError` from + `rglob`: the visible peer file is yielded, the unreadable directory entry + itself is yielded (its parent's `scandir` listed it before the recursion + failed), and the file hiding inside the unreadable directory is silently + absent. No exception escapes. """ visible = tmp_path / 'visible.txt' visible.write_text('content') @@ -1829,33 +1795,6 @@ def test_apply_on_nonexistent_base_path_matches_iteration_behavior(tmp_path: Pat assert seen == iter_paths -@pytest.mark.skipif( - sys.platform == 'win32' or sys.version_info >= (3, 13), - reason='Path.rglob does not raise PermissionError for chmod(0) directories on Windows, and on Python 3.13+ pathlib silently skips inaccessible entries.', -) -def test_apply_propagates_rglob_errors_with_only_files_false(tmp_path: Path): - """ - `apply()` should propagate traversal errors from all-entity crawling. - - The test creates an unreadable directory and verifies that `apply()` via the - crawler surfaces the same `PermissionError` that direct `rglob` would raise. - """ - blocked = tmp_path / 'blocked' - blocked.mkdir() - (blocked / 'file.txt').write_text('content') - blocked.chmod(0) - seen: list = [] - - try: - with pytest.raises( - PermissionError, - match=match(str(PermissionError(errno.EACCES, os.strerror(errno.EACCES), str(blocked)))), - ): - Crawler(tmp_path, only_files=False).apply(seen.append) - finally: - blocked.chmod(stat.S_IRWXU) - - @pytest.mark.parametrize( 'freeze_kwargs', [ @@ -1956,7 +1895,7 @@ def test_freeze_filter_called_for_all_paths_during_snapshot_construction_before_ happened. This precludes the lazy interpretation where the filter would be called incrementally with each yield. """ - files = sorted(tmp_path / f'f{i}.txt' for i in range(5)) + files = [tmp_path / f'f{i}.txt' for i in range(5)] for path in files: path.touch() @@ -1978,10 +1917,10 @@ def test_freeze_filter_not_called_again_during_remaining_iteration(tmp_path: Pat Once the snapshot has been materialised during the first `next()`, the user filter is not invoked again while the remaining snapshot is yielded. - The test complements C1: the snapshot is built exactly once and reused for - the rest of the iteration. We freeze the filter-call count immediately - after the first `next()`, drain the iterator, and verify the count has not - increased. + The test complements the "all paths seen before first yield" companion: the + snapshot is built exactly once and reused for the rest of the iteration. + We freeze the filter-call count immediately after the first `next()`, + drain the iterator, and verify the count has not increased. """ files = [tmp_path / f'f{i}.txt' for i in range(5)] for path in files: @@ -2008,11 +1947,11 @@ def test_without_freeze_filter_is_called_lazily(tmp_path: Path): invoked lazily so that after the first `next()` only a strict subset of paths has been seen. - The contrast with C1 shows that the timing difference is exactly what - `freeze=True` changes. The filter is intentionally always-`True` so that - `len(seen) < N` cannot accidentally hold for some other reason (e.g. - selective filtering). The strict bound `0 < len(seen) < N` proves the - iteration is incremental. + The contrast with the `freeze=True` companion test shows that the timing + difference is exactly what `freeze=True` changes. The filter is + intentionally always-`True` so that `len(seen) < N` cannot accidentally + hold for some other reason (e.g. selective filtering). The strict bound + `0 < len(seen) < N` proves the iteration is incremental. """ files = [tmp_path / f'f{i}.txt' for i in range(5)] for path in files: @@ -2063,10 +2002,11 @@ def test_freeze_does_not_yield_files_created_after_snapshot(tmp_path: Path): A file created between snapshot construction and the end of iteration should not be yielded. - The complement of C4 — the snapshot does not pick up new files that arrive - after construction. The test creates two files, starts iteration to force - snapshot materialisation, then creates a third file before draining the - rest; the yielded set is exactly the two original files. + Counterpart of the delete-during-apply test: the snapshot does not pick up + new files that arrive after construction. The test creates two files, + starts iteration to force snapshot materialisation, then creates a third + file before draining the rest; the yielded set is exactly the two + original files. """ file1 = tmp_path / 'a.txt' file2 = tmp_path / 'b.txt' @@ -2421,7 +2361,7 @@ def test_group_with_freeze_in_children_matches_unfrozen_group( assert set(mix(crawl_directory_path, second_crawl_directory_path)) == expected -def test_group_with_frozen_child_apply_with_deletion(tmp_path: Path): +def test_group_with_frozen_children_apply_with_deletion(tmp_path: Path): """ A `CrawlersGroup` whose children are frozen `Crawler` instances should let `apply()` delete every visited file safely. diff --git a/tests/test_python_crawler.py b/tests/test_python_crawler.py index 463136b..1ebb7ea 100644 --- a/tests/test_python_crawler.py +++ b/tests/test_python_crawler.py @@ -259,9 +259,14 @@ def test_python_crawler_freeze_yields_only_python_files(crawl_directory_path: Un The test confirms that plumbing `freeze` through `super().__init__` does not interfere with the hardcoded `extensions=('.py',)` filter that - `PythonCrawler` applies on every yield. + `PythonCrawler` applies on every yield. The non-emptiness assertion + guards against a vacuous pass if the fixture were ever to lose its `.py` + files. """ - assert set(PythonCrawler(crawl_directory_path, freeze=True)) == set(PythonCrawler(crawl_directory_path)) + frozen = set(PythonCrawler(crawl_directory_path, freeze=True)) + + assert frozen + assert frozen == set(PythonCrawler(crawl_directory_path)) def test_python_crawler_freeze_apply_handles_deletion(tmp_path: Path):