From 57390d1520853725f5c37e519f67c11047ed3607 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 20 Apr 2026 22:53:14 +0200 Subject: [PATCH 01/32] add is_displayed_on_page function with caching --- pypdf/_page.py | 117 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 383be98f4d..594b401c23 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -30,7 +30,7 @@ import math from collections.abc import Iterable, Iterator, Sequence from copy import deepcopy -from dataclasses import asdict, dataclass +from dataclasses import asdict, dataclass, field from decimal import Decimal from io import BytesIO from pathlib import Path @@ -351,6 +351,13 @@ class ImageFile: Reference to the object storing the stream. """ + _displayed_pages: dict[int, bool] = field(default_factory=dict) + """ + Cached dictionary mapping page numbers to display status. + Used for performance optimization when checking multiple pages. + True = displayed, False = not displayed. + """ + def replace(self, new_image: Image, **kwargs: Any) -> None: """ Replace the image with a new PIL image. @@ -414,6 +421,114 @@ def __str__(self) -> str: def __repr__(self) -> str: return self.__str__()[:-1] + f", hash: {hash(self.data)})" + def is_displayed_on_page(self, page_number: int) -> bool: + """ + Check if this image is displayed on the specified page. + + This method determines whether an image is actually rendered on a page + (not just referenced in resources). It checks the page's content stream + for image operators. + + Args: + page_number: The page number to check (0-indexed). + + Returns: + True if the image is displayed on the page, False otherwise. + Returns cached result for pages already checked for performance. + + Examples: + >>> from pypdf import PdfReader + >>> reader = PdfReader("example.pdf") + >>> image = reader.pages[0].images[0] + >>> image.is_displayed_on_page(0) # Check if displayed on page 0 + True + >>> image.is_displayed_on_page(1) # Check if displayed on page 1 + False + """ + # Return cached result if already checked + if page_number in self._displayed_pages: + return self._displayed_pages[page_number] + + # Check if this is an inline image or XObject image + # Inline images have names starting with "~" + if self.name.startswith("~"): + result = self._check_inline_image_displayed(page_number) + else: + result = self._check_xobject_image_displayed(page_number) + + # Cache the result + self._displayed_pages[page_number] = result + return result + + def _check_inline_image_displayed(self, page_number: int) -> bool: + """ + Check if an inline image is displayed on a page. + + Inline images appear in the content stream as "INLINE IMAGE" operators. + The image name starts with "~" and is the first operand of the operator. + + Args: + page_number: The page number to check. + + Returns: + True if the inline image is displayed on the page. + """ + from .generic._data_structures import ContentStream # noqa: PLC0415 + + try: + page = cast("PageObject", self.indirect_reference.pdf.pages[page_number]) + raw_contents = page.get(NameObject("/Contents"), None) + + stream = ContentStream(raw_contents, self.indirect_reference.pdf) + + for operands, operator in stream.operations: + if operator == b"INLINE IMAGE": + # First operand is the inline image name + if operands and operands[0] == self.name: + return True + except (KeyError, IndexError, AttributeError): + pass + + return False + + def _check_xobject_image_displayed(self, page_number: int) -> bool: + """ + Check if an XObject image is displayed on a page. + + XObject images appear in the content stream as "Do" operators. + The image name is the first operand of the Do operator. + The name may have a leading "/" that needs to be stripped. + + Args: + page_number: The page number to check. + + Returns: + True if the XObject image is displayed on the page. + """ + from .generic._data_structures import ContentStream # noqa: PLC0415 + + try: + page = cast("PageObject", self.indirect_reference.pdf.pages[page_number]) + raw_contents = page.get(NameObject("/Contents"), None) + + stream = ContentStream(raw_contents, self.indirect_reference.pdf) + + for operands, operator in stream.operations: + if operator == b"Do": + # First operand is the XObject name (may have leading /) + if operands: + xobj_name = str(operands[0]) + # Compare base names (without extension like .jp2) + img_base = self.name.split(".")[0].lstrip("/") + xobj_base = xobj_name.lstrip("/") + + if img_base == xobj_base: + return True + except (KeyError, IndexError, AttributeError): + pass + + return False + class VirtualListImages(Sequence[ImageFile]): """ From ff16713524dec8d4c41cdd973dd0673d94d0b8c7 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 20 Apr 2026 23:02:52 +0200 Subject: [PATCH 02/32] remove unneeded castings --- pypdf/_page.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 594b401c23..d25a54ca36 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -476,7 +476,7 @@ def _check_inline_image_displayed(self, page_number: int) -> bool: from .generic._data_structures import ContentStream # noqa: PLC0415 try: - page = cast("PageObject", self.indirect_reference.pdf.pages[page_number]) + page = self.indirect_reference.pdf.pages[page_number] raw_contents = page.get(NameObject("/Contents"), None) stream = ContentStream(raw_contents, self.indirect_reference.pdf) @@ -508,7 +508,7 @@ def _check_xobject_image_displayed(self, page_number: int) -> bool: from .generic._data_structures import ContentStream # noqa: PLC0415 try: - page = cast("PageObject", self.indirect_reference.pdf.pages[page_number]) + page = self.indirect_reference.pdf.pages[page_number] raw_contents = page.get(NameObject("/Contents"), None) stream = ContentStream(raw_contents, self.indirect_reference.pdf) From cf232f9988ded833ee7a1f649f46353bf3a70748 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 20 Apr 2026 23:43:33 +0200 Subject: [PATCH 03/32] comply with linter --- pypdf/_page.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index d25a54ca36..b81c76f322 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -482,10 +482,9 @@ def _check_inline_image_displayed(self, page_number: int) -> bool: stream = ContentStream(raw_contents, self.indirect_reference.pdf) for operands, operator in stream.operations: - if operator == b"INLINE IMAGE": - # First operand is the inline image name - if operands and operands[0] == self.name: - return True + # First operand is the inline image name + if operator == b"INLINE IMAGE" and operands and operands[0] == self.name: + return True except (KeyError, IndexError, AttributeError): pass @@ -514,16 +513,15 @@ def _check_xobject_image_displayed(self, page_number: int) -> bool: stream = ContentStream(raw_contents, self.indirect_reference.pdf) for operands, operator in stream.operations: - if operator == b"Do": - # First operand is the XObject name (may have leading /) - if operands: - xobj_name = str(operands[0]) - # Compare base names (without extension like .jp2) - img_base = self.name.split(".")[0].lstrip("/") - xobj_base = xobj_name.lstrip("/") - - if img_base == xobj_base: - return True + # First operand is the XObject name (may have leading /) + if operator == b"Do" and operands: + xobj_name = str(operands[0]) + # Compare base names (without extension like .jp2) + img_base = self.name.split(".")[0].lstrip("/") + xobj_base = xobj_name.lstrip("/") + + if img_base == xobj_base: + return True except (KeyError, IndexError, AttributeError): pass From 4d1ca4ebd7ab1235ec6a50e829be9b0c422961dc Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 20 Apr 2026 23:47:05 +0200 Subject: [PATCH 04/32] comply with linter --- pypdf/_page.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pypdf/_page.py b/pypdf/_page.py index b81c76f322..7f06888073 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -476,6 +476,8 @@ def _check_inline_image_displayed(self, page_number: int) -> bool: from .generic._data_structures import ContentStream # noqa: PLC0415 try: + if not self.indirect_reference: + return False page = self.indirect_reference.pdf.pages[page_number] raw_contents = page.get(NameObject("/Contents"), None) @@ -507,6 +509,8 @@ def _check_xobject_image_displayed(self, page_number: int) -> bool: from .generic._data_structures import ContentStream # noqa: PLC0415 try: + if not self.indirect_reference: + return False page = self.indirect_reference.pdf.pages[page_number] raw_contents = page.get(NameObject("/Contents"), None) From 672ad47f2886294c797e627d900a7c484f6a6391 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 20 Apr 2026 23:54:33 +0200 Subject: [PATCH 05/32] remove example --- pypdf/_page.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 7f06888073..779fb3da71 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -435,15 +435,6 @@ def is_displayed_on_page(self, page_number: int) -> bool: Returns: True if the image is displayed on the page, False otherwise. Returns cached result for pages already checked for performance. - - Examples: - >>> from pypdf import PdfReader - >>> reader = PdfReader("example.pdf") - >>> image = reader.pages[0].images[0] - >>> image.is_displayed_on_page(0) # Check if displayed on page 0 - True - >>> image.is_displayed_on_page(1) # Check if displayed on page 1 - False """ # Return cached result if already checked if page_number in self._displayed_pages: From 069d5c5fa3ff0774d21a17544f17f19d2fb617d4 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 20 Apr 2026 23:56:18 +0200 Subject: [PATCH 06/32] add minimal test --- tests/test_images.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_images.py b/tests/test_images.py index fa05396795..cd11391f0c 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -658,3 +658,27 @@ def test_get_ids_image__resources_is_none(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[2] assert list(page.images.items()) == [] + + +def test_is_xobject_image_displayed(): + """Test XObject image display detection with expected results.""" + reader = PdfReader("local_test/example.pdf") + + # Based on local_test/example.py analysis: + # Page 1: Im8.jp2 displayed, Im20.jp2 not displayed + # Page 2: Neither displayed + # Page 3: Im20.jp2 displayed, Im8.jp2 not displayed + expected_results = [ + (0, "/Im20", False), + (0, "/Im8", True), + (1, "/Im20", False), + (1, "/Im8", False), + (2, "/Im20", True), + (2, "/Im8", False), + ] + + for page_num, image_id, expected in expected_results: + img = reader.pages[page_num].images[image_id] + is_used = img.is_displayed_on_page(page_num) + assert isinstance(is_used, bool), f"is_displayed_on_page() must return bool for {image_id}" + assert is_used == expected, f"Page {page_num}: {image_id} expected {expected}, got {is_used}" From ff80e2e7ef31785e042e120ffcadd4ee76107f85 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 20 Apr 2026 23:58:35 +0200 Subject: [PATCH 07/32] comply with linter --- tests/test_images.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_images.py b/tests/test_images.py index cd11391f0c..75d5d819e4 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -663,7 +663,7 @@ def test_get_ids_image__resources_is_none(): def test_is_xobject_image_displayed(): """Test XObject image display detection with expected results.""" reader = PdfReader("local_test/example.pdf") - + # Based on local_test/example.py analysis: # Page 1: Im8.jp2 displayed, Im20.jp2 not displayed # Page 2: Neither displayed @@ -676,7 +676,7 @@ def test_is_xobject_image_displayed(): (2, "/Im20", True), (2, "/Im8", False), ] - + for page_num, image_id, expected in expected_results: img = reader.pages[page_num].images[image_id] is_used = img.is_displayed_on_page(page_num) From dd2ac2eaf7db03aa713f2c71f0cb88f4db701735 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Tue, 21 Apr 2026 22:42:01 +0200 Subject: [PATCH 08/32] fix docstring and pdf path in test_is_xobject_image_displayed, add pytest decorator --- tests/test_images.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_images.py b/tests/test_images.py index 75d5d819e4..9a769c7a45 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -660,11 +660,12 @@ def test_get_ids_image__resources_is_none(): assert list(page.images.items()) == [] +@pytest.mark.samples def test_is_xobject_image_displayed(): - """Test XObject image display detection with expected results.""" - reader = PdfReader("local_test/example.pdf") + """This test ensures that only actually displayed images are detected by `ImageFile.is_displayed_on_page`""" + path = SAMPLE_ROOT / "027-image-references-deduplication/wrong-references.pdf" + reader = PdfReader(path) - # Based on local_test/example.py analysis: # Page 1: Im8.jp2 displayed, Im20.jp2 not displayed # Page 2: Neither displayed # Page 3: Im20.jp2 displayed, Im8.jp2 not displayed From 27fc2bb2883b6140b141e41736976882c9c40299 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Wed, 22 Apr 2026 00:04:32 +0200 Subject: [PATCH 09/32] switch from page_number to page as is_displayed_on_page input --- pypdf/_page.py | 52 +++++++++++++++++---------------------- tests/test_images.py | 58 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 70 insertions(+), 40 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 779fb3da71..e3c01986ce 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -351,9 +351,10 @@ class ImageFile: Reference to the object storing the stream. """ - _displayed_pages: dict[int, bool] = field(default_factory=dict) + _displayed_pages: list["PageObject"] = field(default_factory=list) + _displayed_pages_status: list[bool] = field(default_factory=list) """ - Cached dictionary mapping page numbers to display status. + Cached pages and display statuses, with same ordering. Used for performance optimization when checking multiple pages. True = displayed, False = not displayed. """ @@ -421,7 +422,7 @@ def __str__(self) -> str: def __repr__(self) -> str: return self.__str__()[:-1] + f", hash: {hash(self.data)})" - def is_displayed_on_page(self, page_number: int) -> bool: + def is_displayed_on_page(self, page: "PageObject") -> bool: """ Check if this image is displayed on the specified page. @@ -430,28 +431,32 @@ def is_displayed_on_page(self, page_number: int) -> bool: for image operators. Args: - page_number: The page number to check (0-indexed). + page: The page object to check. Returns: True if the image is displayed on the page, False otherwise. Returns cached result for pages already checked for performance. """ # Return cached result if already checked - if page_number in self._displayed_pages: - return self._displayed_pages[page_number] + try: + displayed_page_index = self._displayed_pages.index(page) + return self._displayed_pages_status[displayed_page_index] + except ValueError: + pass # Check if this is an inline image or XObject image # Inline images have names starting with "~" if self.name.startswith("~"): - result = self._check_inline_image_displayed(page_number) + result = self._check_inline_image_displayed(page) else: - result = self._check_xobject_image_displayed(page_number) + result = self._check_xobject_image_displayed(page) # Cache the result - self._displayed_pages[page_number] = result + self._displayed_pages.append(page) + self._displayed_pages_status.append(result) return result - def _check_inline_image_displayed(self, page_number: int) -> bool: + def _check_inline_image_displayed(self, page: "PageObject") -> bool: """ Check if an inline image is displayed on a page. @@ -459,31 +464,18 @@ def _check_inline_image_displayed(self, page_number: int) -> bool: The image name starts with "~" and is the first operand of the operator. Args: - page_number: The page number to check. + page: The page to check. Returns: True if the inline image is displayed on the page. """ - from .generic._data_structures import ContentStream # noqa: PLC0415 - - try: - if not self.indirect_reference: - return False - page = self.indirect_reference.pdf.pages[page_number] - raw_contents = page.get(NameObject("/Contents"), None) - - stream = ContentStream(raw_contents, self.indirect_reference.pdf) - - for operands, operator in stream.operations: - # First operand is the inline image name - if operator == b"INLINE IMAGE" and operands and operands[0] == self.name: - return True - except (KeyError, IndexError, AttributeError): - pass + image_name = self.name.split(".")[0] + if page.inline_images: + return len(list(filter(lambda i: i == image_name, page.inline_images.keys()))) > 0 return False - def _check_xobject_image_displayed(self, page_number: int) -> bool: + def _check_xobject_image_displayed(self, page: "PageObject") -> bool: """ Check if an XObject image is displayed on a page. @@ -492,7 +484,7 @@ def _check_xobject_image_displayed(self, page_number: int) -> bool: The name may have a leading "/" that needs to be stripped. Args: - page_number: The page number to check. + page: The page to check. Returns: True if the XObject image is displayed on the page. @@ -502,7 +494,7 @@ def _check_xobject_image_displayed(self, page_number: int) -> bool: try: if not self.indirect_reference: return False - page = self.indirect_reference.pdf.pages[page_number] + raw_contents = page.get(NameObject("/Contents"), None) stream = ContentStream(raw_contents, self.indirect_reference.pdf) diff --git a/tests/test_images.py b/tests/test_images.py index 9a769c7a45..1749b1d11a 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -7,6 +7,7 @@ from io import BytesIO from pathlib import Path +from time import perf_counter from typing import Union from unittest import mock from zipfile import ZipFile @@ -662,24 +663,61 @@ def test_get_ids_image__resources_is_none(): @pytest.mark.samples def test_is_xobject_image_displayed(): - """This test ensures that only actually displayed images are detected by `ImageFile.is_displayed_on_page`""" + """ + This test ensures that only actually displayed referenced images + are detected by `ImageFile.is_displayed_on_page` + """ path = SAMPLE_ROOT / "027-image-references-deduplication/wrong-references.pdf" reader = PdfReader(path) + pages = reader.pages + page_1, page_2, page_3 = pages # Page 1: Im8.jp2 displayed, Im20.jp2 not displayed # Page 2: Neither displayed # Page 3: Im20.jp2 displayed, Im8.jp2 not displayed expected_results = [ - (0, "/Im20", False), - (0, "/Im8", True), - (1, "/Im20", False), - (1, "/Im8", False), - (2, "/Im20", True), - (2, "/Im8", False), + (page_1, 0, "/Im20", False), + (page_1, 0, "/Im8", True), + (page_2, 1, "/Im20", False), + (page_2, 1, "/Im8", False), + (page_3, 2, "/Im20", True), + (page_3, 2, "/Im8", False), + ] + + start_no_cache = perf_counter() + for page, page_num, image_id, expected in expected_results: + img = page.images[image_id] + is_used = img.is_displayed_on_page(page) + assert isinstance(is_used, bool), f"is_displayed_on_page() must return bool for {image_id}" + assert is_used == expected, f"Page {page_num}: {image_id} expected {expected}, got {is_used}" + end_no_cache = perf_counter() + + start_cache = perf_counter() + for page, page_num, image_id, expected in expected_results: + img = page.images[image_id] + is_used = img.is_displayed_on_page(page) + assert isinstance(is_used, bool), f"is_displayed_on_page() must return bool for {image_id}" + assert is_used == expected, f"Page {page_num}: {image_id} expected {expected}, got {is_used}" + end_cache = perf_counter() + + # Check caching improves performance by at least 100x + assert (end_cache-start_cache) < (end_no_cache-start_no_cache) + +@pytest.mark.samples +def test_is_inline_image_displayed(): + """This test ensures that displayed inline images are detected by `ImageFile.is_displayed_on_page`""" + path = SAMPLE_ROOT / "008-reportlab-inline-image/inline-image.pdf" + reader = PdfReader(path) + pages = reader.pages + page_1 = pages[0] + + # Page 1: + expected_results = [ + (page_1, 0, "~0~", True), ] - for page_num, image_id, expected in expected_results: - img = reader.pages[page_num].images[image_id] - is_used = img.is_displayed_on_page(page_num) + for page, page_num, image_id, expected in expected_results: + img = page.images[image_id] + is_used = img.is_displayed_on_page(page) assert isinstance(is_used, bool), f"is_displayed_on_page() must return bool for {image_id}" assert is_used == expected, f"Page {page_num}: {image_id} expected {expected}, got {is_used}" From 5f59487d969ff663f7f9fc8e62506a8d7fc754e6 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Wed, 22 Apr 2026 00:09:33 +0200 Subject: [PATCH 10/32] temporarily remove is_displayed_on_page caching --- pypdf/_page.py | 21 +-------------------- tests/test_images.py | 14 -------------- 2 files changed, 1 insertion(+), 34 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index e3c01986ce..49d77fb642 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -30,7 +30,7 @@ import math from collections.abc import Iterable, Iterator, Sequence from copy import deepcopy -from dataclasses import asdict, dataclass, field +from dataclasses import asdict, dataclass from decimal import Decimal from io import BytesIO from pathlib import Path @@ -351,14 +351,6 @@ class ImageFile: Reference to the object storing the stream. """ - _displayed_pages: list["PageObject"] = field(default_factory=list) - _displayed_pages_status: list[bool] = field(default_factory=list) - """ - Cached pages and display statuses, with same ordering. - Used for performance optimization when checking multiple pages. - True = displayed, False = not displayed. - """ - def replace(self, new_image: Image, **kwargs: Any) -> None: """ Replace the image with a new PIL image. @@ -435,15 +427,7 @@ def is_displayed_on_page(self, page: "PageObject") -> bool: Returns: True if the image is displayed on the page, False otherwise. - Returns cached result for pages already checked for performance. """ - # Return cached result if already checked - try: - displayed_page_index = self._displayed_pages.index(page) - return self._displayed_pages_status[displayed_page_index] - except ValueError: - pass - # Check if this is an inline image or XObject image # Inline images have names starting with "~" if self.name.startswith("~"): @@ -451,9 +435,6 @@ def is_displayed_on_page(self, page: "PageObject") -> bool: else: result = self._check_xobject_image_displayed(page) - # Cache the result - self._displayed_pages.append(page) - self._displayed_pages_status.append(result) return result def _check_inline_image_displayed(self, page: "PageObject") -> bool: diff --git a/tests/test_images.py b/tests/test_images.py index 1749b1d11a..5f58af7727 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -7,7 +7,6 @@ from io import BytesIO from pathlib import Path -from time import perf_counter from typing import Union from unittest import mock from zipfile import ZipFile @@ -684,24 +683,11 @@ def test_is_xobject_image_displayed(): (page_3, 2, "/Im8", False), ] - start_no_cache = perf_counter() for page, page_num, image_id, expected in expected_results: img = page.images[image_id] is_used = img.is_displayed_on_page(page) assert isinstance(is_used, bool), f"is_displayed_on_page() must return bool for {image_id}" assert is_used == expected, f"Page {page_num}: {image_id} expected {expected}, got {is_used}" - end_no_cache = perf_counter() - - start_cache = perf_counter() - for page, page_num, image_id, expected in expected_results: - img = page.images[image_id] - is_used = img.is_displayed_on_page(page) - assert isinstance(is_used, bool), f"is_displayed_on_page() must return bool for {image_id}" - assert is_used == expected, f"Page {page_num}: {image_id} expected {expected}, got {is_used}" - end_cache = perf_counter() - - # Check caching improves performance by at least 100x - assert (end_cache-start_cache) < (end_no_cache-start_no_cache) @pytest.mark.samples def test_is_inline_image_displayed(): From 58c75a6e5ac9512def20e44655d0996494d613c5 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Wed, 22 Apr 2026 22:07:13 +0200 Subject: [PATCH 11/32] switch display check to image constructor --- pypdf/_page.py | 88 +++++++++++++++++++------------------------------- 1 file changed, 34 insertions(+), 54 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index bdd36c07c1..ea45965f43 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -351,6 +351,18 @@ class ImageFile: Reference to the object storing the stream. """ + is_inline: bool = False + """ + True if this is an inline image (~0~, ~1~, etc.). + """ + + is_displayed: bool = False + """ + True if this image is displayed in the page content stream. + Some PDFs duplicate image references over all the pages, + so this is needed to disambiguate. + """ + def replace(self, new_image: Image, **kwargs: Any) -> None: """ Replace the image with a new PIL image. @@ -414,67 +426,28 @@ def __str__(self) -> str: def __repr__(self) -> str: return self.__str__()[:-1] + f", hash: {hash(self.data)})" - def is_displayed_on_page(self, page: "PageObject") -> bool: + def _check_displayed(self, page: "PageObject") -> None: """ - Check if this image is displayed on the specified page. - - This method determines whether an image is actually rendered on a page - (not just referenced in resources). It checks the page's content stream - for image operators. - - Args: - page: The page object to check. - - Returns: - True if the image is displayed on the page, False otherwise. - """ - # Check if this is an inline image or XObject image - # Inline images have names starting with "~" - if self.name.startswith("~"): - result = self._check_inline_image_displayed(page) - else: - result = self._check_xobject_image_displayed(page) - - return result - - def _check_inline_image_displayed(self, page: "PageObject") -> bool: - """ - Check if an inline image is displayed on a page. - - Inline images appear in the content stream as "INLINE IMAGE" operators. - The image name starts with "~" and is the first operand of the operator. + Check if this image is displayed in the page content stream. Args: page: The page to check. - Returns: - True if the inline image is displayed on the page. + Sets: + is_displayed: True if the image is displayed, False otherwise. """ - image_name = self.name.split(".")[0] - - if page.inline_images: - return len(list(filter(lambda i: i == image_name, page.inline_images.keys()))) > 0 - return False - - def _check_xobject_image_displayed(self, page: "PageObject") -> bool: - """ - Check if an XObject image is displayed on a page. - - XObject images appear in the content stream as "Do" operators. - The image name is the first operand of the Do operator. - The name may have a leading "/" that needs to be stripped. - - Args: - page: The page to check. + # Inline images are always displayed + if self.is_inline: + self.is_displayed = True + return - Returns: - True if the XObject image is displayed on the page. - """ + # Check XObject images in content stream from .generic._data_structures import ContentStream # noqa: PLC0415 try: if not self.indirect_reference: - return False + self.is_displayed = False + return raw_contents = page.get(NameObject("/Contents"), None) @@ -489,11 +462,12 @@ def _check_xobject_image_displayed(self, page: "PageObject") -> bool: xobj_base = xobj_name.lstrip("/") if img_base == xobj_base: - return True + self.is_displayed = True + return except (KeyError, IndexError, AttributeError): pass - return False + self.is_displayed = False class VirtualListImages(Sequence[ImageFile]): @@ -734,18 +708,24 @@ def _get_image( self.inline_images = self._get_inline_images() if self.inline_images is None: raise KeyError("No inline image can be found") - return self.inline_images[id] + img = self.inline_images[id] + img.is_inline = True + img.is_displayed = True + return img assert xobjs is not None from .generic._image_xobject import _xobj_to_image # noqa: PLC0415 imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id])) extension, byte_stream = imgd[:2] - return ImageFile( + img = ImageFile( name=f"{id[1:]}{extension}", data=byte_stream, image=imgd[2], indirect_reference=xobjs[id].indirect_reference, ) + img.is_inline = False + img._check_displayed(self) + return img # in a subobject assert xobjs is not None ids = id[1:] From 2966ee237f9e5c457e5ec94d3174e969377dbb80 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Wed, 22 Apr 2026 22:12:37 +0200 Subject: [PATCH 12/32] fix tests to use the new is_displayed property --- tests/test_images.py | 48 ++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/tests/test_images.py b/tests/test_images.py index 5f58af7727..e06279bd70 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -229,7 +229,7 @@ def test_image_extraction(src, page_index, image_key, expected): def test_get_inline_image_without_xobject_resources(): page = PageObject(None, None) - inline_image = object() + inline_image = type("Mock", (), {"is_inline": True, "is_displayed": True})() with mock.patch.object(page, "_get_inline_images", return_value={"~0~": inline_image}): assert page._get_image("~0~") is inline_image @@ -662,48 +662,38 @@ def test_get_ids_image__resources_is_none(): @pytest.mark.samples def test_is_xobject_image_displayed(): - """ - This test ensures that only actually displayed referenced images - are detected by `ImageFile.is_displayed_on_page` - """ + """Test XObject image display detection with expected results.""" path = SAMPLE_ROOT / "027-image-references-deduplication/wrong-references.pdf" reader = PdfReader(path) - pages = reader.pages - page_1, page_2, page_3 = pages - # Page 1: Im8.jp2 displayed, Im20.jp2 not displayed - # Page 2: Neither displayed - # Page 3: Im20.jp2 displayed, Im8.jp2 not displayed expected_results = [ - (page_1, 0, "/Im20", False), - (page_1, 0, "/Im8", True), - (page_2, 1, "/Im20", False), - (page_2, 1, "/Im8", False), - (page_3, 2, "/Im20", True), - (page_3, 2, "/Im8", False), + # Page 1: /Im20 not displayed, /Im8 displayed + (0, "/Im20", False), + (0, "/Im8", True), + # Page 2: Neither displayed + (1, "/Im20", False), + (1, "/Im8", False), + # Page 3: /Im20 displayed, /Im8 not displayed + (2, "/Im20", True), + (2, "/Im8", False), ] - for page, page_num, image_id, expected in expected_results: - img = page.images[image_id] - is_used = img.is_displayed_on_page(page) - assert isinstance(is_used, bool), f"is_displayed_on_page() must return bool for {image_id}" - assert is_used == expected, f"Page {page_num}: {image_id} expected {expected}, got {is_used}" + for page_num, image_id, expected in expected_results: + img = reader.pages[page_num].images[image_id] + assert img.is_displayed == expected, f"Page {page_num}: {image_id} expected {expected}, got {img.is_displayed}" @pytest.mark.samples def test_is_inline_image_displayed(): - """This test ensures that displayed inline images are detected by `ImageFile.is_displayed_on_page`""" + """This test ensures that displayed inline images are detected by `ImageFile.is_displayed`""" path = SAMPLE_ROOT / "008-reportlab-inline-image/inline-image.pdf" reader = PdfReader(path) - pages = reader.pages - page_1 = pages[0] # Page 1: expected_results = [ - (page_1, 0, "~0~", True), + (0, "~0~", True), ] - for page, page_num, image_id, expected in expected_results: + for page_num, image_id, expected in expected_results: + page = reader.pages[page_num] img = page.images[image_id] - is_used = img.is_displayed_on_page(page) - assert isinstance(is_used, bool), f"is_displayed_on_page() must return bool for {image_id}" - assert is_used == expected, f"Page {page_num}: {image_id} expected {expected}, got {is_used}" + assert img.is_displayed == expected, f"Page {page_num}: {image_id} expected {expected}, got {img.is_displayed}" From 14a56d7047a956c5d8c1830851b703a42059bf7b Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Thu, 30 Apr 2026 23:09:04 +0200 Subject: [PATCH 13/32] move image displayed check to page initialization --- pypdf/_page.py | 185 +++++++++++++++++++++++++++++++------------------ 1 file changed, 117 insertions(+), 68 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 376d0aee8e..15da6ff3ef 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -426,49 +426,6 @@ def __str__(self) -> str: def __repr__(self) -> str: return self.__str__()[:-1] + f", hash: {hash(self.data)})" - def _check_displayed(self, page: "PageObject") -> None: - """ - Check if this image is displayed in the page content stream. - - Args: - page: The page to check. - - Sets: - is_displayed: True if the image is displayed, False otherwise. - """ - # Inline images are always displayed - if self.is_inline: - self.is_displayed = True - return - - # Check XObject images in content stream - from .generic._data_structures import ContentStream # noqa: PLC0415 - - try: - if not self.indirect_reference: - self.is_displayed = False - return - - raw_contents = page.get(NameObject("/Contents"), None) - - stream = ContentStream(raw_contents, self.indirect_reference.pdf) - - for operands, operator in stream.operations: - # First operand is the XObject name (may have leading /) - if operator == b"Do" and operands: - xobj_name = str(operands[0]) - # Compare base names (without extension like .jp2) - img_base = self.name.split(".")[0].lstrip("/") - xobj_base = xobj_name.lstrip("/") - - if img_base == xobj_base: - self.is_displayed = True - return - except (KeyError, IndexError, AttributeError): - pass - - self.is_displayed = False - class VirtualListImages(Sequence[ImageFile]): """ @@ -559,7 +516,7 @@ def __init__( ) -> None: DictionaryObject.__init__(self) self.pdf = pdf - self.inline_images: Optional[dict[str, ImageFile]] = None + self.displayed_images: Optional[dict[str, ImageFile]] = None self.indirect_reference = indirect_reference if not is_null_or_none(indirect_reference): assert indirect_reference is not None, "mypy" @@ -655,8 +612,8 @@ def _get_ids_image( if _i in call_stack: return [] call_stack.append(_i) - if self.inline_images is None: - self.inline_images = self._get_inline_images() + if self.displayed_images is None: + self.displayed_images = self._parse_images_from_content_stream() if obj is None: obj = self if ancest is None: @@ -667,19 +624,42 @@ def _get_ids_image( is_null_or_none(resources := obj[PG.RESOURCES]) or RES.XOBJECT not in cast(DictionaryObject, resources) ): - return [] if self.inline_images is None else list(self.inline_images.keys()) + return [] if self.displayed_images is None else list(self.displayed_images.keys()) x_object = resources[RES.XOBJECT].get_object() # type: ignore + + # Iterate through all XObject resources for o in x_object: + # Skip non-stream objects (only process StreamObject) if not isinstance(x_object[o], StreamObject): continue + + # Check if this XObject is an Image if x_object[o][IA.SUBTYPE] == "/Image": + # Add the image ID (with ancestry if needed) + # When ancest is empty, o is top-level: "/I0" + # When ancest is not empty, [ancest, o] is nested: ["/Form1", "/I0"] lst.append(o if len(ancest) == 0 else [*ancest, o]) - else: # is a form with possible images inside + + # If it's a form, recursively search for images inside it + else: + # Forms may contain images that are Do-referenced in their content stream lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack)) - assert self.inline_images is not None - lst.extend(list(self.inline_images.keys())) - return lst + + # Removes duplicates and preserves order + deduplicated = [] + for item in lst: + if item not in deduplicated: + deduplicated.append(item) + + # Add inline images (they may overlap with XObject images) + # Preserves order + # Inline images have names starting with ~ (e.g., ~0~, ~1~) + for k in self.displayed_images: + if k not in deduplicated: + deduplicated.append(k) + + return deduplicated def _get_image( self, @@ -704,28 +684,33 @@ def _get_image( ) from exc if isinstance(id, str): if id[0] == "~" and id[-1] == "~": - if self.inline_images is None: - self.inline_images = self._get_inline_images() - if self.inline_images is None: + if self.displayed_images is None: + self.displayed_images = self._parse_images_from_content_stream() + if self.displayed_images is None: raise KeyError("No inline image can be found") - img = self.inline_images[id] + img = self.displayed_images[id] img.is_inline = True img.is_displayed = True return img assert xobjs is not None + # Check if image is in content stream (from _parse_images_from_content_stream) + if self.displayed_images and id in self.displayed_images: + img = self.displayed_images[id] + img.is_inline = False + return img + from .generic._image_xobject import _xobj_to_image # noqa: PLC0415 imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id])) extension, byte_stream = imgd[:2] - img = ImageFile( + return ImageFile( name=f"{id[1:]}{extension}", data=byte_stream, image=imgd[2], indirect_reference=xobjs[id].indirect_reference, + is_inline=False, + is_displayed=False, # XObject images from resources only (not in content stream) ) - img.is_inline = False - img._check_displayed(self) - return img # in a subobject assert xobjs is not None ids = id[1:] @@ -754,7 +739,9 @@ def images(self) -> VirtualListImages: * `.name` : name of the object * `.data` : bytes of the object * `.image` : PIL Image Object - * `.indirect_reference` : object reference + * `.indirect_reference` : object reference (None for inline images) + * `.is_inline` : True for inline images (~0~, ~1~...), False for XObjects + * `.is_displayed` : True for images found in content stream, False otherwise and the following methods: `.replace(new_image: PIL.Image.Image, **kwargs)` : @@ -765,9 +752,6 @@ def images(self) -> VirtualListImages: reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20) - Inline images are extracted and named ~0~, ~1~, ..., with the - indirect_reference set to None. - """ return VirtualListImages(self._get_ids_image, self._get_image) @@ -786,24 +770,85 @@ def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject: raise PdfReadError(f"Cannot find resource entry {v} for {k}") return v - def _get_inline_images(self) -> dict[str, ImageFile]: - """Load inline images. Entries will be identified as `~1~`.""" + def _parse_images_from_content_stream(self) -> dict[str, ImageFile]: + """Load images from content stream. Includes both inline images and Do-referenced images. + + This method scans the page content stream and extracts: + + 1. **Inline images** (~0~, ~1~...): Embedded directly in content stream via BI/EI operators + - is_inline=True, is_displayed=True, indirect_reference=None + + 2. **Do-referenced images** (/Im0, /Im1...): Referenced via "Do" operator + - is_inline=False, is_displayed=True, indirect_reference= + + 3. **Pure XObject images** (/I0, /Image1...): Defined in Resources only (not in content stream) + - is_inline=False, is_displayed=False, indirect_reference= + + Returns: + Dictionary mapping image names to ImageFile instances. + """ + # Idempotent: if already parsed, return cached result + if self.displayed_images is not None: + return self.displayed_images + content = self.get_contents() if is_null_or_none(content): - return {} + self.displayed_images = {} + return self.displayed_images imgs_data = [] + do_image_names: list[bytes] = [] assert content is not None, "mypy" for param, ope in content.operations: if ope == b"INLINE IMAGE": imgs_data.append( {"settings": param["settings"], "__streamdata__": param["data"]} ) + elif ope == b"Do" and param: + do_image_names.append(param[0]) # First operand is the XObject name elif ope in (b"BI", b"EI", b"ID"): # pragma: no cover raise PdfReadError( f"{ope!r} operator met whereas not expected, " "please share use case with pypdf dev team" ) + # Process Do-referenced images first files = {} + xobjs: Optional[DictionaryObject] = None + try: + resources = cast(DictionaryObject, self[PG.RESOURCES]) + xobjs = cast(DictionaryObject, resources[RES.XOBJECT]) + except KeyError: + pass # Continue with inline images only + + if xobjs is None: + # No XOBJECT resources, skip Do-referenced images + pass + else: + for do_name in do_image_names: + try: + # Handle both NameObject (str) and bytes + if isinstance(do_name, bytes): + do_name_str = do_name.decode() + else: + do_name_str = str(do_name) + xobj = xobjs[do_name] + # Only process if it's an actual image, not a form + if isinstance(xobj, DictionaryObject) and str(xobj[IA.SUBTYPE]) == "/Image": + from .generic._image_xobject import _xobj_to_image as _xobj_to_image2 # noqa: PLC0415 + imgd = _xobj_to_image2(xobj) + extension, byte_stream, img = imgd + img_file = ImageFile( + name=f"{do_name_str.lstrip('/')}{extension}", + data=byte_stream, + image=img, + indirect_reference=xobj.indirect_reference, + is_inline=False, + is_displayed=True, # Do-referenced images are always displayed + ) + files[do_name_str] = img_file + except KeyError: + continue + + # Then process inline images for num, ii in enumerate(imgs_data): init = { "__streamdata__": ii["__streamdata__"], @@ -829,8 +874,12 @@ def _get_inline_images(self) -> dict[str, ImageFile]: data=byte_stream, image=img, indirect_reference=None, + is_inline=True, + is_displayed=True, ) - return files + + self.displayed_images = files + return self.displayed_images @property def rotation(self) -> int: @@ -1115,7 +1164,7 @@ def replace_contents( # this will be fixed with the _add_object self[NameObject(PG.CONTENTS)] = content # forces recalculation of inline_images - self.inline_images = None + self.displayed_images = None def merge_page( self, page2: "PageObject", expand: bool = False, over: bool = True From e7f78cf5a4c1c243da634cc8d1e239ef3a416f38 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Thu, 30 Apr 2026 23:12:26 +0200 Subject: [PATCH 14/32] update references to _parse_images_from_content_stream --- tests/test_images.py | 10 +++++----- tests/test_workflows.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_images.py b/tests/test_images.py index e06279bd70..29911d20e2 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -231,7 +231,7 @@ def test_get_inline_image_without_xobject_resources(): page = PageObject(None, None) inline_image = type("Mock", (), {"is_inline": True, "is_displayed": True})() - with mock.patch.object(page, "_get_inline_images", return_value={"~0~": inline_image}): + with mock.patch.object(page, "_parse_images_from_content_stream", return_value={"~0~": inline_image}): assert page._get_image("~0~") is inline_image @@ -239,7 +239,7 @@ def test_get_inline_image_without_xobject_resources_raises_when_missing(): page = PageObject(None, None) with ( - mock.patch.object(page, "_get_inline_images", return_value=None), + mock.patch.object(page, "_parse_images_from_content_stream", return_value=None), pytest.raises(KeyError, match="No inline image can be found"), ): page._get_image("~0~") @@ -442,9 +442,9 @@ def test_inline_image_extraction(): assert image_similarity(writer.pages[0].images[i].image, img) == 1 writer.pages[0].extract_text() # check recalculation of inline images - assert writer.pages[0].inline_images is not None + assert writer.pages[0].displayed_images is not None writer.pages[0].merge_scaled_page(writer.pages[0], 0.25) - assert writer.pages[0].inline_images is None + assert writer.pages[0].displayed_images is None reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") writer.pages[0].merge_page(reader.pages[0]) assert list(writer.pages[0].images.keys()) == [ @@ -556,7 +556,7 @@ def test_contentstream__read_inline_image__fallback_is_successful(): ) page = PageObject(pdf=None) with mock.patch.object(page, "get_contents", return_value=stream): - images = page._get_inline_images() + images = page._parse_images_from_content_stream() assert list(images) == ["~0~"] assert images["~0~"].data == ( b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x02\x00\x00\x00\x01\x08\x02\x00\x00\x00{@\xe8\xdd\x00\x00\x00\x0f" diff --git a/tests/test_workflows.py b/tests/test_workflows.py index e23547269a..d2f89a9743 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1012,7 +1012,7 @@ def test_inline_images(): with pytest.raises(KeyError): reader.pages[0].images["~999~"] del reader.pages[1]["/Resources"]["/ColorSpace"]["/R124"] - reader.pages[1].inline_images = None # to force recalculation + reader.pages[1].displayed_images = None # to force recalculation with pytest.raises(PdfReadError): reader.pages[1].images["~1~"] From f0de97ddc5f2deee27558f78073f3d0f04fab894 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Wed, 6 May 2026 19:35:31 +0200 Subject: [PATCH 15/32] fix conflict with main --- pypdf/_page.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 15da6ff3ef..6c941810ae 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -58,9 +58,8 @@ logger_warning, matrix_multiply, ) -from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING +from .constants import _INLINE_IMAGE_KEY_MAPPING, _INLINE_IMAGE_VALUE_MAPPING, ImageAttributes from .constants import AnnotationDictionaryAttributes as ADA -from .constants import ImageAttributes as IA from .constants import PageAttributes as PG from .constants import Resources as RES from .errors import PageSizeNotDefinedError, PdfReadError @@ -635,7 +634,7 @@ def _get_ids_image( continue # Check if this XObject is an Image - if x_object[o][IA.SUBTYPE] == "/Image": + if x_object[o][ImageAttributes.SUBTYPE] == "/Image": # Add the image ID (with ancestry if needed) # When ancest is empty, o is top-level: "/I0" # When ancest is not empty, [ancest, o] is nested: ["/Form1", "/I0"] @@ -832,7 +831,7 @@ def _parse_images_from_content_stream(self) -> dict[str, ImageFile]: do_name_str = str(do_name) xobj = xobjs[do_name] # Only process if it's an actual image, not a form - if isinstance(xobj, DictionaryObject) and str(xobj[IA.SUBTYPE]) == "/Image": + if isinstance(xobj, DictionaryObject) and str(xobj[ImageAttributes.SUBTYPE]) == "/Image": from .generic._image_xobject import _xobj_to_image as _xobj_to_image2 # noqa: PLC0415 imgd = _xobj_to_image2(xobj) extension, byte_stream, img = imgd From 54d6dd2c75172e4bc03fdf5205d0035499655430 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Sat, 16 May 2026 19:22:25 +0200 Subject: [PATCH 16/32] update sample files --- sample-files | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample-files b/sample-files index 8c405ece5e..f1d7699748 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 8c405ece5eff12396a34a1fae3276132002e1753 +Subproject commit f1d76997481db978d22c1bbe6580056fae7e9088 From 6db13895a11e43aa1f9c322eaabe3dfb0d903937 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Sat, 16 May 2026 19:37:57 +0200 Subject: [PATCH 17/32] add _displayed_images test file --- sample-files | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample-files b/sample-files index f1d7699748..4c08ef37ce 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit f1d76997481db978d22c1bbe6580056fae7e9088 +Subproject commit 4c08ef37ce9dbedf7887b93e470ce5e98dedc5df From f0c7a727b815d66d9ff3f52ad188c78aad8ac99a Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Sat, 16 May 2026 19:45:19 +0200 Subject: [PATCH 18/32] make _displayed_images private, deprecate inline_images and derive it from _displayed_images --- pypdf/_page.py | 76 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 18 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index eb9303294d..1a2edf524f 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -55,6 +55,7 @@ TransformationMatrixType, _human_readable_bytes, deprecate, + deprecate_with_replacement, logger_warning, matrix_multiply, ) @@ -524,7 +525,7 @@ def __init__( ) -> None: DictionaryObject.__init__(self) self.pdf = pdf - self.displayed_images: Optional[dict[str, ImageFile]] = None + self._displayed_images: Optional[dict[str, ImageFile]] = None self.indirect_reference = indirect_reference if not is_null_or_none(indirect_reference): assert indirect_reference is not None, "mypy" @@ -620,8 +621,8 @@ def _get_ids_image( if _i in call_stack: return [] call_stack.append(_i) - if self.displayed_images is None: - self.displayed_images = self._parse_images_from_content_stream() + if self._displayed_images is None: + self._displayed_images = self._parse_images_from_content_stream() if obj is None: obj = self if ancest is None: @@ -632,7 +633,7 @@ def _get_ids_image( is_null_or_none(resources := obj[PG.RESOURCES]) or RES.XOBJECT not in cast(DictionaryObject, resources) ): - return [] if self.displayed_images is None else list(self.displayed_images.keys()) + return [] if self._displayed_images is None else list(self._displayed_images.keys()) x_object = resources[RES.XOBJECT].get_object() # type: ignore @@ -663,7 +664,7 @@ def _get_ids_image( # Add inline images (they may overlap with XObject images) # Preserves order # Inline images have names starting with ~ (e.g., ~0~, ~1~) - for k in self.displayed_images: + for k in self._displayed_images: if k not in deduplicated: deduplicated.append(k) @@ -692,19 +693,19 @@ def _get_image( ) from exc if isinstance(id, str): if id[0] == "~" and id[-1] == "~": - if self.displayed_images is None: - self.displayed_images = self._parse_images_from_content_stream() - if self.displayed_images is None: + if self._displayed_images is None: + self._displayed_images = self._parse_images_from_content_stream() + if self._displayed_images is None: raise KeyError("No inline image can be found") - img = self.displayed_images[id] + img = self._displayed_images[id] img.is_inline = True img.is_displayed = True return img assert xobjs is not None # Check if image is in content stream (from _parse_images_from_content_stream) - if self.displayed_images and id in self.displayed_images: - img = self.displayed_images[id] + if self._displayed_images and id in self._displayed_images: + img = self._displayed_images[id] img.is_inline = False return img @@ -763,6 +764,45 @@ def images(self) -> VirtualListImages: """ return VirtualListImages(self._get_ids_image, self._get_image) + @property + def inline_images(self) -> Optional[dict[str, ImageFile]]: + """ + Return only inline images from the page. + + .. deprecated:: + Use :attr:`images` and filter by :attr:`ImageFile.is_inline` instead. + This property will be removed in pypdf 7.0. + + Examples: + >>> from pypdf import PdfReader + >>> reader = PdfReader("example.pdf") + >>> page = reader.pages[0] + >>> inline_images = page.inline_images # Deprecated + """ + deprecate_with_replacement( + "PageObject.inline_images", + "PageObject.images", + "7.0", + ) + if self._displayed_images is None: + return None + return {k: v for k, v in self._displayed_images.items() if v.is_inline} + + @inline_images.setter + def inline_images(self, value: Optional[dict[str, ImageFile]]) -> None: + """ + Setter for inline_images. + + Setting to None clears the cache and forces recalculation on next access, + emulating the previous caching control mechanism. + """ + if value is None: + self._displayed_images = None + else: + if self._displayed_images is None: + self._displayed_images = {} + self._displayed_images.update(value) + def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject: """Translate values used in inline image""" try: @@ -796,13 +836,13 @@ def _parse_images_from_content_stream(self) -> dict[str, ImageFile]: Dictionary mapping image names to ImageFile instances. """ # Idempotent: if already parsed, return cached result - if self.displayed_images is not None: - return self.displayed_images + if self._displayed_images is not None: + return self._displayed_images content = self.get_contents() if is_null_or_none(content): - self.displayed_images = {} - return self.displayed_images + self._displayed_images = {} + return self._displayed_images imgs_data = [] do_image_names: list[bytes] = [] assert content is not None, "mypy" @@ -886,8 +926,8 @@ def _parse_images_from_content_stream(self) -> dict[str, ImageFile]: is_displayed=True, ) - self.displayed_images = files - return self.displayed_images + self._displayed_images = files + return self._displayed_images @property def rotation(self) -> int: @@ -1172,7 +1212,7 @@ def replace_contents( # this will be fixed with the _add_object self[NameObject(PG.CONTENTS)] = content # forces recalculation of inline_images - self.displayed_images = None + self._displayed_images = None def merge_page( self, page2: "PageObject", expand: bool = False, over: bool = True From 18ebf94da2516866d48191869ec2b80fbab47d9b Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Sat, 16 May 2026 19:45:42 +0200 Subject: [PATCH 19/32] update _displayed_images references --- tests/test_images.py | 4 ++-- tests/test_workflows.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_images.py b/tests/test_images.py index 894b16c1ec..21ea50c685 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -442,9 +442,9 @@ def test_inline_image_extraction(): assert image_similarity(writer.pages[0].images[i].image, img) == 1 writer.pages[0].extract_text() # check recalculation of inline images - assert writer.pages[0].displayed_images is not None + assert writer.pages[0]._displayed_images is not None writer.pages[0].merge_scaled_page(writer.pages[0], 0.25) - assert writer.pages[0].displayed_images is None + assert writer.pages[0]._displayed_images is None reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") writer.pages[0].merge_page(reader.pages[0]) assert list(writer.pages[0].images.keys()) == [ diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 782fe73be6..82cef3f7aa 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1032,7 +1032,7 @@ def test_inline_images(): with pytest.raises(KeyError): reader.pages[0].images["~999~"] del reader.pages[1]["/Resources"]["/ColorSpace"]["/R124"] - reader.pages[1].displayed_images = None # to force recalculation + reader.pages[1]._displayed_images = None # to force recalculation with pytest.raises(PdfReadError): reader.pages[1].images["~1~"] From 983022fec9a2af4a4c415528f46c5ccf13fc48fa Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Sat, 16 May 2026 19:53:01 +0200 Subject: [PATCH 20/32] update inline_images references --- pypdf/_page.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 1a2edf524f..46a219dcd6 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -777,7 +777,7 @@ def inline_images(self) -> Optional[dict[str, ImageFile]]: >>> from pypdf import PdfReader >>> reader = PdfReader("example.pdf") >>> page = reader.pages[0] - >>> inline_images = page.inline_images # Deprecated + >>> inline_images = {k: v for k, v in page.images.items() if v.is_inline} """ deprecate_with_replacement( "PageObject.inline_images", @@ -794,7 +794,8 @@ def inline_images(self, value: Optional[dict[str, ImageFile]]) -> None: Setter for inline_images. Setting to None clears the cache and forces recalculation on next access, - emulating the previous caching control mechanism. + emulating the previous caching control mechanism. Setting to a dict merges + the values into the existing cache. """ if value is None: self._displayed_images = None @@ -1211,7 +1212,7 @@ def replace_contents( # as a backup solution, we put content as an object although not in accordance with pdf ref # this will be fixed with the _add_object self[NameObject(PG.CONTENTS)] = content - # forces recalculation of inline_images + # forces recalculation of images self._displayed_images = None def merge_page( From d6b7ff45d1eead36d5b70f6e329a59b34bea4982 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Sat, 16 May 2026 20:09:37 +0200 Subject: [PATCH 21/32] update some image paths --- tests/test_images.py | 68 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/tests/test_images.py b/tests/test_images.py index 21ea50c685..7ecdced61c 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -5,6 +5,7 @@ and/or the actual image data with the expected value. """ +import warnings from io import BytesIO from pathlib import Path from typing import Union @@ -663,7 +664,7 @@ def test_get_ids_image__resources_is_none(): @pytest.mark.samples def test_is_xobject_image_displayed(): """Test XObject image display detection with expected results.""" - path = SAMPLE_ROOT / "027-image-references-deduplication/wrong-references.pdf" + path = SAMPLE_ROOT / "028-image-references-deduplication/wrong-references.pdf" reader = PdfReader(path) expected_results = [ @@ -697,3 +698,68 @@ def test_is_inline_image_displayed(): page = reader.pages[page_num] img = page.images[image_id] assert img.is_displayed == expected, f"Page {page_num}: {image_id} expected {expected}, got {img.is_displayed}" + + +@pytest.mark.samples +def test_inline_images_property_deprecation_warning(): + """Test that inline_images property emits a deprecation warning.""" + reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") + page = reader.pages[0] + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + _ = page.inline_images + assert len(w) == 1 + assert issubclass(w[0].category, DeprecationWarning) + assert "inline_images" in str(w[0].message) + assert "images" in str(w[0].message) + + +@pytest.mark.samples +def test_inline_images_property_returns_only_inline(): + """Test that inline_images returns only images with is_inline=True.""" + reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") + page = reader.pages[0] + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + inline = page.inline_images + if inline is not None: + for k, v in inline.items(): + assert v.is_inline is True, f"Image {k} should have is_inline=True" + + +@pytest.mark.samples +def test_inline_images_setter_clears_cache(): + """Test that setting inline_images to None clears the cache.""" + reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") + page = reader.pages[0] + + # Force cache population by accessing images + _ = list(page.images) + assert page._displayed_images is not None + + # Clear cache via setter + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + page.inline_images = None + assert page._displayed_images is None + + +@pytest.mark.samples +def test_inline_images_setter_merges(): + """Test that setting inline_images to a dict merges into the cache.""" + reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") + page = reader.pages[0] + + # Force cache population by accessing images + _ = list(page.images) + original_keys = set(page._displayed_images.keys()) + + # Merge new values + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + page.inline_images = {"new_key": page.images[0]} + merged_keys = set(page._displayed_images.keys()) + assert original_keys.issubset(merged_keys), "Original keys should be preserved" + assert "new_key" in merged_keys, "New key should be added" From 6f0aa8b2c14b18ac63bff122e1d535ef96b38cd8 Mon Sep 17 00:00:00 2001 From: Andrea Santoro Date: Mon, 18 May 2026 17:55:01 +0200 Subject: [PATCH 22/32] Update tests/test_images.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- tests/test_images.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_images.py b/tests/test_images.py index 7ecdced61c..c8a40dcddd 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -683,6 +683,7 @@ def test_is_xobject_image_displayed(): img = reader.pages[page_num].images[image_id] assert img.is_displayed == expected, f"Page {page_num}: {image_id} expected {expected}, got {img.is_displayed}" + @pytest.mark.samples def test_is_inline_image_displayed(): """This test ensures that displayed inline images are detected by `ImageFile.is_displayed`""" From 183e10f8e98c97644c88893b03bfb1aca20b89f3 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 18 May 2026 17:57:15 +0200 Subject: [PATCH 23/32] rename _displayed_images to _content_stream_images --- pypdf/_page.py | 48 ++++++++++++++++++++--------------------- tests/test_images.py | 12 +++++------ tests/test_workflows.py | 2 +- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 46a219dcd6..b45ebb415f 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -525,7 +525,7 @@ def __init__( ) -> None: DictionaryObject.__init__(self) self.pdf = pdf - self._displayed_images: Optional[dict[str, ImageFile]] = None + self._content_stream_images: Optional[dict[str, ImageFile]] = None self.indirect_reference = indirect_reference if not is_null_or_none(indirect_reference): assert indirect_reference is not None, "mypy" @@ -621,8 +621,8 @@ def _get_ids_image( if _i in call_stack: return [] call_stack.append(_i) - if self._displayed_images is None: - self._displayed_images = self._parse_images_from_content_stream() + if self._content_stream_images is None: + self._content_stream_images = self._parse_images_from_content_stream() if obj is None: obj = self if ancest is None: @@ -633,7 +633,7 @@ def _get_ids_image( is_null_or_none(resources := obj[PG.RESOURCES]) or RES.XOBJECT not in cast(DictionaryObject, resources) ): - return [] if self._displayed_images is None else list(self._displayed_images.keys()) + return [] if self._content_stream_images is None else list(self._content_stream_images.keys()) x_object = resources[RES.XOBJECT].get_object() # type: ignore @@ -664,7 +664,7 @@ def _get_ids_image( # Add inline images (they may overlap with XObject images) # Preserves order # Inline images have names starting with ~ (e.g., ~0~, ~1~) - for k in self._displayed_images: + for k in self._content_stream_images: if k not in deduplicated: deduplicated.append(k) @@ -693,19 +693,19 @@ def _get_image( ) from exc if isinstance(id, str): if id[0] == "~" and id[-1] == "~": - if self._displayed_images is None: - self._displayed_images = self._parse_images_from_content_stream() - if self._displayed_images is None: + if self._content_stream_images is None: + self._content_stream_images = self._parse_images_from_content_stream() + if self._content_stream_images is None: raise KeyError("No inline image can be found") - img = self._displayed_images[id] + img = self._content_stream_images[id] img.is_inline = True img.is_displayed = True return img assert xobjs is not None # Check if image is in content stream (from _parse_images_from_content_stream) - if self._displayed_images and id in self._displayed_images: - img = self._displayed_images[id] + if self._content_stream_images and id in self._content_stream_images: + img = self._content_stream_images[id] img.is_inline = False return img @@ -784,9 +784,9 @@ def inline_images(self) -> Optional[dict[str, ImageFile]]: "PageObject.images", "7.0", ) - if self._displayed_images is None: + if self._content_stream_images is None: return None - return {k: v for k, v in self._displayed_images.items() if v.is_inline} + return {k: v for k, v in self._content_stream_images.items() if v.is_inline} @inline_images.setter def inline_images(self, value: Optional[dict[str, ImageFile]]) -> None: @@ -798,11 +798,11 @@ def inline_images(self, value: Optional[dict[str, ImageFile]]) -> None: the values into the existing cache. """ if value is None: - self._displayed_images = None + self._content_stream_images = None else: - if self._displayed_images is None: - self._displayed_images = {} - self._displayed_images.update(value) + if self._content_stream_images is None: + self._content_stream_images = {} + self._content_stream_images.update(value) def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject: """Translate values used in inline image""" @@ -837,13 +837,13 @@ def _parse_images_from_content_stream(self) -> dict[str, ImageFile]: Dictionary mapping image names to ImageFile instances. """ # Idempotent: if already parsed, return cached result - if self._displayed_images is not None: - return self._displayed_images + if self._content_stream_images is not None: + return self._content_stream_images content = self.get_contents() if is_null_or_none(content): - self._displayed_images = {} - return self._displayed_images + self._content_stream_images = {} + return self._content_stream_images imgs_data = [] do_image_names: list[bytes] = [] assert content is not None, "mypy" @@ -927,8 +927,8 @@ def _parse_images_from_content_stream(self) -> dict[str, ImageFile]: is_displayed=True, ) - self._displayed_images = files - return self._displayed_images + self._content_stream_images = files + return self._content_stream_images @property def rotation(self) -> int: @@ -1213,7 +1213,7 @@ def replace_contents( # this will be fixed with the _add_object self[NameObject(PG.CONTENTS)] = content # forces recalculation of images - self._displayed_images = None + self._content_stream_images = None def merge_page( self, page2: "PageObject", expand: bool = False, over: bool = True diff --git a/tests/test_images.py b/tests/test_images.py index c8a40dcddd..f2ae11d369 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -443,9 +443,9 @@ def test_inline_image_extraction(): assert image_similarity(writer.pages[0].images[i].image, img) == 1 writer.pages[0].extract_text() # check recalculation of inline images - assert writer.pages[0]._displayed_images is not None + assert writer.pages[0]._content_stream_images is not None writer.pages[0].merge_scaled_page(writer.pages[0], 0.25) - assert writer.pages[0]._displayed_images is None + assert writer.pages[0]._content_stream_images is None reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") writer.pages[0].merge_page(reader.pages[0]) assert list(writer.pages[0].images.keys()) == [ @@ -738,13 +738,13 @@ def test_inline_images_setter_clears_cache(): # Force cache population by accessing images _ = list(page.images) - assert page._displayed_images is not None + assert page._content_stream_images is not None # Clear cache via setter with warnings.catch_warnings(): warnings.simplefilter("ignore") page.inline_images = None - assert page._displayed_images is None + assert page._content_stream_images is None @pytest.mark.samples @@ -755,12 +755,12 @@ def test_inline_images_setter_merges(): # Force cache population by accessing images _ = list(page.images) - original_keys = set(page._displayed_images.keys()) + original_keys = set(page._content_stream_images.keys()) # Merge new values with warnings.catch_warnings(): warnings.simplefilter("ignore") page.inline_images = {"new_key": page.images[0]} - merged_keys = set(page._displayed_images.keys()) + merged_keys = set(page._content_stream_images.keys()) assert original_keys.issubset(merged_keys), "Original keys should be preserved" assert "new_key" in merged_keys, "New key should be added" diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 82cef3f7aa..cdf870371b 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1032,7 +1032,7 @@ def test_inline_images(): with pytest.raises(KeyError): reader.pages[0].images["~999~"] del reader.pages[1]["/Resources"]["/ColorSpace"]["/R124"] - reader.pages[1]._displayed_images = None # to force recalculation + reader.pages[1]._content_stream_images = None # to force recalculation with pytest.raises(PdfReadError): reader.pages[1].images["~1~"] From ccf4a9ddb417a2a59a6630caa0ea218b0c94f88b Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 18 May 2026 18:09:04 +0200 Subject: [PATCH 24/32] remove wrong docstring --- pypdf/_page.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index b45ebb415f..24c617ff97 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -741,22 +741,6 @@ def images(self) -> VirtualListImages: * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form * `for img in reader.pages[0].images:` # loops through all objects - images.keys() and images.items() can be used. - - The ImageFile has the following properties: - - * `.name` : name of the object - * `.data` : bytes of the object - * `.image` : PIL Image Object - * `.indirect_reference` : object reference (None for inline images) - * `.is_inline` : True for inline images (~0~, ~1~...), False for XObjects - * `.is_displayed` : True for images found in content stream, False otherwise - - and the following methods: - `.replace(new_image: PIL.Image.Image, **kwargs)` : - replace the image in the pdf with the new image - applying the saving parameters indicated (such as quality) - Example usage: reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20) From 42c1f819950c8c493909424d3be188b9e34a981e Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 18 May 2026 18:10:10 +0200 Subject: [PATCH 25/32] add deprecation notice to inline_images setter --- pypdf/_page.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pypdf/_page.py b/pypdf/_page.py index 24c617ff97..7d4fa091f7 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -780,6 +780,10 @@ def inline_images(self, value: Optional[dict[str, ImageFile]]) -> None: Setting to None clears the cache and forces recalculation on next access, emulating the previous caching control mechanism. Setting to a dict merges the values into the existing cache. + + .. deprecated:: + Use :attr:`images` and filter by :attr:`ImageFile.is_inline` instead. + This property will be removed in pypdf 7.0. """ if value is None: self._content_stream_images = None From 364ccbf26bea0dedcf491df035135156d20803af Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 18 May 2026 18:20:17 +0200 Subject: [PATCH 26/32] remove unneeded cache setter --- pypdf/_page.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 7d4fa091f7..23c1b98058 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -696,7 +696,7 @@ def _get_image( if self._content_stream_images is None: self._content_stream_images = self._parse_images_from_content_stream() if self._content_stream_images is None: - raise KeyError("No inline image can be found") + raise KeyError("No image can be found") img = self._content_stream_images[id] img.is_inline = True img.is_displayed = True @@ -915,8 +915,7 @@ def _parse_images_from_content_stream(self) -> dict[str, ImageFile]: is_displayed=True, ) - self._content_stream_images = files - return self._content_stream_images + return files @property def rotation(self) -> int: From 683d5d43fe6e98262cb62f02dbd30ad112e30b82 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 18 May 2026 18:27:49 +0200 Subject: [PATCH 27/32] use regular mock instead of type --- tests/test_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_images.py b/tests/test_images.py index f2ae11d369..0a74b9815e 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -230,7 +230,7 @@ def test_image_extraction(src, page_index, image_key, expected): def test_get_inline_image_without_xobject_resources(): page = PageObject(None, None) - inline_image = type("Mock", (), {"is_inline": True, "is_displayed": True})() + inline_image = mock.Mock(is_inline=True, is_displayed=True) with mock.patch.object(page, "_parse_images_from_content_stream", return_value={"~0~": inline_image}): assert page._get_image("~0~") is inline_image From 70963f678e707ac9733f5da87d54b76cf438b35c Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 18 May 2026 18:30:07 +0200 Subject: [PATCH 28/32] remove unneeded cache setter --- pypdf/_page.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 23c1b98058..f1bf570273 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -830,8 +830,7 @@ def _parse_images_from_content_stream(self) -> dict[str, ImageFile]: content = self.get_contents() if is_null_or_none(content): - self._content_stream_images = {} - return self._content_stream_images + return {} imgs_data = [] do_image_names: list[bytes] = [] assert content is not None, "mypy" From 439fab3ca250daa292b6af02e0c902f0bf987381 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 18 May 2026 19:55:16 +0200 Subject: [PATCH 29/32] fix key error message in test_get_inline_image_without_xobject_resources_raises_when_missing --- tests/test_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_images.py b/tests/test_images.py index 0a74b9815e..3aa7d306fd 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -241,7 +241,7 @@ def test_get_inline_image_without_xobject_resources_raises_when_missing(): with ( mock.patch.object(page, "_parse_images_from_content_stream", return_value=None), - pytest.raises(KeyError, match="No inline image can be found"), + pytest.raises(KeyError, match="No image can be found"), ): page._get_image("~0~") From e4ea2413ac613a96c780e67d9759417cd1a201b1 Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 18 May 2026 20:13:00 +0200 Subject: [PATCH 30/32] invalidate cache after manipulating images --- tests/test_filters.py | 1 + tests/test_images.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/test_filters.py b/tests/test_filters.py index 93f7fd8d81..d6aef434dc 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -589,6 +589,7 @@ def test_jpx_no_spacecode(): # create an object without filter and without colorspace # just for coverage del im.indirect_reference.get_object()["/Filter"] + reader.pages[0]._content_stream_images = None # invalidate cache with pytest.raises(PdfReadError) as exc: reader.pages[0].images[0] assert exc.value.args[0].startswith("ColorSpace field not found") diff --git a/tests/test_images.py b/tests/test_images.py index 3aa7d306fd..dbcc105ef9 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -312,6 +312,7 @@ def test_separation_1byte_to_rgb_inverted(): assert image_similarity(reader.pages[0].images[0].image, img) >= 0.99 obj = reader.pages[0].images[0].indirect_reference.get_object() obj.set_data(obj.get_data() + b"\x00") + reader.pages[0]._content_stream_images = None # invalidate cache with pytest.raises(ValueError): reader.pages[0].images[0] From 38eebdbb41edcf42dd267c34c39f840af7ddfefc Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Mon, 18 May 2026 22:49:35 +0200 Subject: [PATCH 31/32] emit warnings for image read errors instead of crashing --- pypdf/_page.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pypdf/_page.py b/pypdf/_page.py index f1bf570273..48ab5f6343 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -883,6 +883,14 @@ def _parse_images_from_content_stream(self) -> dict[str, ImageFile]: files[do_name_str] = img_file except KeyError: continue + except OSError as e: + logger_warning( + "Failed loading image %(image_name)s: %(exception)s", + source=__name__, + image_name=do_name_str, + exception=e, + ) + continue # Then process inline images for num, ii in enumerate(imgs_data): From bb11c8c132c578b6fdfb7552c79a5eaeee7c7a9d Mon Sep 17 00:00:00 2001 From: "andreasantoro.pvt@gmail.com" Date: Tue, 19 May 2026 17:55:25 +0200 Subject: [PATCH 32/32] remove abbreviations --- pypdf/_page.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 48ab5f6343..03a67cd707 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -770,7 +770,11 @@ def inline_images(self) -> Optional[dict[str, ImageFile]]: ) if self._content_stream_images is None: return None - return {k: v for k, v in self._content_stream_images.items() if v.is_inline} + return { + image_name: image_file + for image_name, image_file in self._content_stream_images.items() + if image_file.is_inline + } @inline_images.setter def inline_images(self, value: Optional[dict[str, ImageFile]]) -> None: