Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
57390d1
add is_displayed_on_page function with caching
andreasntr Apr 20, 2026
ff16713
remove unneeded castings
andreasntr Apr 20, 2026
cf232f9
comply with linter
andreasntr Apr 20, 2026
4d1ca4e
comply with linter
andreasntr Apr 20, 2026
672ad47
remove example
andreasntr Apr 20, 2026
069d5c5
add minimal test
andreasntr Apr 20, 2026
ff80e2e
comply with linter
andreasntr Apr 20, 2026
dd2ac2e
fix docstring and pdf path in test_is_xobject_image_displayed, add py…
andreasntr Apr 21, 2026
27fc2bb
switch from page_number to page as is_displayed_on_page input
andreasntr Apr 21, 2026
5f59487
temporarily remove is_displayed_on_page caching
andreasntr Apr 21, 2026
a09a6bb
Merge branch 'main' into main
andreasntr Apr 22, 2026
58c75a6
switch display check to image constructor
andreasntr Apr 22, 2026
2966ee2
fix tests to use the new is_displayed property
andreasntr Apr 22, 2026
3bcf9a5
Merge branch 'main' into main
andreasntr Apr 26, 2026
14a56d7
move image displayed check to page initialization
andreasntr Apr 30, 2026
e7f78cf
update references to _parse_images_from_content_stream
andreasntr Apr 30, 2026
b2a6114
Merge branch 'main' into main
andreasntr May 1, 2026
f0de97d
fix conflict with main
andreasntr May 6, 2026
6cad13a
Merge branch 'main' into main
andreasntr May 6, 2026
69cb462
Merge branch 'main' into main
andreasntr May 16, 2026
54d6dd2
update sample files
andreasntr May 16, 2026
6db1389
add _displayed_images test file
andreasntr May 16, 2026
f0c7a72
make _displayed_images private, deprecate inline_images and derive it…
andreasntr May 16, 2026
18ebf94
update _displayed_images references
andreasntr May 16, 2026
983022f
update inline_images references
andreasntr May 16, 2026
d6b7ff4
update some image paths
andreasntr May 16, 2026
973f345
Merge branch 'main' into main
andreasntr May 18, 2026
6f0aa8b
Update tests/test_images.py
andreasntr May 18, 2026
183e10f
rename _displayed_images to _content_stream_images
andreasntr May 18, 2026
ccf4a9d
remove wrong docstring
andreasntr May 18, 2026
42c1f81
add deprecation notice to inline_images setter
andreasntr May 18, 2026
364ccbf
remove unneeded cache setter
andreasntr May 18, 2026
683d5d4
use regular mock instead of type
andreasntr May 18, 2026
70963f6
remove unneeded cache setter
andreasntr May 18, 2026
439fab3
fix key error message in test_get_inline_image_without_xobject_resour…
andreasntr May 18, 2026
e4ea241
invalidate cache after manipulating images
andreasntr May 18, 2026
38eebdb
emit warnings for image read errors instead of crashing
andreasntr May 18, 2026
bb11c8c
remove abbreviations
andreasntr May 19, 2026
00411b1
Merge branch 'main' into main
andreasntr May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,87 @@ def __str__(self) -> str:
def __repr__(self) -> str:
    """
    Return the ``__str__`` text extended with a hash of the image data.

    The last character of the string form is removed and replaced by a
    ``", hash: <hash>)"`` suffix computed from ``self.data``.

    Returns:
        The extended textual representation.
    """
    # Everything except the final character of the string form; the suffix
    # appended below supplies its own closing parenthesis.
    text_without_last_char = self.__str__()[:-1]
    return text_without_last_char + f", hash: {hash(self.data)})"

def is_displayed_on_page(self, page: "PageObject") -> bool:
    """
    Tell whether this image is actually drawn on the given page.

    Being listed in a page's resources is not enough for an image to be
    rendered, so the decision is delegated to one of two helpers depending
    on the kind of image: names starting with "~" are treated as inline
    images, every other name as an XObject image.

    Args:
        page: The page object to check.

    Returns:
        True if the image is displayed on the page, False otherwise.
    """
    # Pick the helper first, then call it once with the page.
    check_displayed = (
        self._check_inline_image_displayed
        if self.name.startswith("~")
        else self._check_xobject_image_displayed
    )
    return check_displayed(page)

def _check_inline_image_displayed(self, page: "PageObject") -> bool:
    """
    Check if an inline image is displayed on a page.

    The image counts as displayed when its name, with everything from the
    first "." onwards removed, is one of the keys of ``page.inline_images``.

    Args:
        page: The page to check.

    Returns:
        True if the inline image is displayed on the page, False when it
        is not or when the page has no inline images.
    """
    # Drop the file extension: keys of ``page.inline_images`` are compared
    # against the bare image identifier.
    image_name = self.name.split(".", 1)[0]

    # Read the attribute only once; the membership test below replaces the
    # former filter/list/len chain with a direct key lookup.
    inline_images = page.inline_images
    if not inline_images:
        return False
    return image_name in inline_images

def _check_xobject_image_displayed(self, page: "PageObject") -> bool:
    """
    Check if an XObject image is displayed on a page.

    The page's ``/Contents`` entry is parsed as a content stream and
    scanned for ``Do`` operators. The image counts as displayed when the
    first operand of such an operator, without leading "/", equals this
    image's name without its file extension and without leading "/".

    Only the operations of the page's own content stream are inspected.

    Args:
        page: The page to check.

    Returns:
        True if the XObject image is displayed on the page. False when it
        is not, when the image has no indirect reference, or when reading
        the content stream raises KeyError, IndexError or AttributeError.
    """
    from .generic._data_structures import ContentStream  # noqa: PLC0415

    try:
        if not self.indirect_reference:
            return False

        # Computed once instead of once per ``Do`` operator. Only the last
        # ".suffix" is removed so that resource names which themselves
        # contain a period are compared in full.
        # NOTE(review): assumes ``self.name`` is "<resource name><.extension>"
        # -- confirm against where ImageFile names are built.
        image_base = self.name.rsplit(".", 1)[0].lstrip("/")

        raw_contents = page.get(NameObject("/Contents"), None)
        stream = ContentStream(raw_contents, self.indirect_reference.pdf)

        # First operand of ``Do`` is the XObject name (may have leading /).
        return any(
            operator == b"Do"
            and bool(operands)
            and str(operands[0]).lstrip("/") == image_base
            for operands, operator in stream.operations
        )
    except (KeyError, IndexError, AttributeError):
        # Deliberate best effort: a page whose content stream cannot be
        # read is reported as "not displayed" rather than raising.
        return False


class VirtualListImages(Sequence[ImageFile]):
"""
Expand Down
49 changes: 49 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,3 +658,52 @@ def test_get_ids_image__resources_is_none():
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
page = reader.pages[2]
assert list(page.images.items()) == []


@pytest.mark.samples
def test_is_xobject_image_displayed():
    """
    Verify that `ImageFile.is_displayed_on_page` reports a referenced
    XObject image as displayed only on pages that actually draw it.
    """
    reader = PdfReader(SAMPLE_ROOT / "027-image-references-deduplication/wrong-references.pdf")
    page_1, page_2, page_3 = reader.pages

    # Expected display state per page, in page order:
    #   page 1 shows Im8.jp2 but not Im20.jp2,
    #   page 2 shows neither,
    #   page 3 shows Im20.jp2 but not Im8.jp2.
    expectations_per_page = [
        (page_1, {"/Im20": False, "/Im8": True}),
        (page_2, {"/Im20": False, "/Im8": False}),
        (page_3, {"/Im20": True, "/Im8": False}),
    ]

    for page_num, (page, expectations) in enumerate(expectations_per_page):
        for image_id, expected in expectations.items():
            is_used = page.images[image_id].is_displayed_on_page(page)
            assert isinstance(is_used, bool), f"is_displayed_on_page() must return bool for {image_id}"
            assert is_used == expected, f"Page {page_num}: {image_id} expected {expected}, got {is_used}"

Comment thread
andreasntr marked this conversation as resolved.
@pytest.mark.samples
def test_is_inline_image_displayed():
    """Verify that `ImageFile.is_displayed_on_page` detects a displayed inline image."""
    reader = PdfReader(SAMPLE_ROOT / "008-reportlab-inline-image/inline-image.pdf")

    # Single case: the inline image "~0~" on the first page is displayed.
    page, page_num, image_id, expected = reader.pages[0], 0, "~0~", True

    is_used = page.images[image_id].is_displayed_on_page(page)
    assert isinstance(is_used, bool), f"is_displayed_on_page() must return bool for {image_id}"
    assert is_used == expected, f"Page {page_num}: {image_id} expected {expected}, got {is_used}"
Loading