diff --git a/pypdf/_page.py b/pypdf/_page.py index 55cbeab96..03a67cd70 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -55,6 +55,7 @@ TransformationMatrixType, _human_readable_bytes, deprecate, + deprecate_with_replacement, logger_warning, matrix_multiply, ) @@ -359,6 +360,18 @@ class ImageFile: Reference to the object storing the stream. """ + is_inline: bool = False + """ + True if this is an inline image (~0~, ~1~, etc.). + """ + + is_displayed: bool = False + """ + True if this image is displayed in the page content stream. + Some PDFs duplicate image references over all the pages, + so this is needed to disambiguate. + """ + def replace(self, new_image: Image, **kwargs: Any) -> None: """ Replace the image with a new PIL image. @@ -512,7 +525,7 @@ def __init__( ) -> None: DictionaryObject.__init__(self) self.pdf = pdf - self.inline_images: Optional[dict[str, ImageFile]] = None + self._content_stream_images: Optional[dict[str, ImageFile]] = None self.indirect_reference = indirect_reference if not is_null_or_none(indirect_reference): assert indirect_reference is not None, "mypy" @@ -608,8 +621,8 @@ def _get_ids_image( if _i in call_stack: return [] call_stack.append(_i) - if self.inline_images is None: - self.inline_images = self._get_inline_images() + if self._content_stream_images is None: + self._content_stream_images = self._parse_images_from_content_stream() if obj is None: obj = self if ancest is None: @@ -620,19 +633,42 @@ def _get_ids_image( is_null_or_none(resources := obj[PG.RESOURCES]) or RES.XOBJECT not in cast(DictionaryObject, resources) ): - return [] if self.inline_images is None else list(self.inline_images.keys()) + return [] if self._content_stream_images is None else list(self._content_stream_images.keys()) x_object = resources[RES.XOBJECT].get_object() # type: ignore + + # Iterate through all XObject resources for o in x_object: + # Skip non-stream objects (only process StreamObject) if not isinstance(x_object[o], StreamObject): continue + + # Check if this XObject is an Image if x_object[o][ImageAttributes.SUBTYPE] == "/Image": + # Add the image ID (with ancestry if needed) + # When ancest is empty, o is top-level: "/I0" + # When ancest is not empty, [ancest, o] is nested: ["/Form1", "/I0"] lst.append(o if len(ancest) == 0 else [*ancest, o]) - else: # is a form with possible images inside + + # If it's a form, recursively search for images inside it + else: + # Forms may contain images that are Do-referenced in their content stream lst.extend(self._get_ids_image(x_object[o], [*ancest, o], call_stack)) - assert self.inline_images is not None - lst.extend(list(self.inline_images.keys())) - return lst + + # Removes duplicates and preserves order + deduplicated = [] + for item in lst: + if item not in deduplicated: + deduplicated.append(item) + + # Add inline images (they may overlap with XObject images) + # Preserves order + # Inline images have names starting with ~ (e.g., ~0~, ~1~) + for k in self._content_stream_images: + if k not in deduplicated: + deduplicated.append(k) + + return deduplicated def _get_image( self, @@ -657,13 +693,22 @@ def _get_image( ) from exc if isinstance(id, str): if id[0] == "~" and id[-1] == "~": - if self.inline_images is None: - self.inline_images = self._get_inline_images() - if self.inline_images is None: - raise KeyError("No inline image can be found") - return self.inline_images[id] + if self._content_stream_images is None: + self._content_stream_images = self._parse_images_from_content_stream() + if self._content_stream_images is None: + raise KeyError("No image can be found") + img = self._content_stream_images[id] + img.is_inline = True + img.is_displayed = True + return img assert xobjs is not None + # Check if image is in content stream (from _parse_images_from_content_stream) + if self._content_stream_images and id in self._content_stream_images: + img = self._content_stream_images[id] + img.is_inline = False + return img + from .generic._image_xobject import _xobj_to_image # noqa: PLC0415 imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id])) extension, byte_stream = imgd[:2] @@ -672,6 +717,8 @@ def _get_image( data=byte_stream, image=imgd[2], indirect_reference=xobjs[id].indirect_reference, + is_inline=False, + is_displayed=False, # XObject images from resources only (not in content stream) ) # in a subobject assert xobjs is not None @@ -694,29 +741,60 @@ def images(self) -> VirtualListImages: * `reader.pages[0].images['/TP1','/Image1']` # return image '/Image1' within '/TP1' XObject form * `for img in reader.pages[0].images:` # loops through all objects - images.keys() and images.items() can be used. + Example usage: - The ImageFile has the following properties: + reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20) - * `.name` : name of the object - * `.data` : bytes of the object - * `.image` : PIL Image Object - * `.indirect_reference` : object reference + """ + return VirtualListImages(self._get_ids_image, self._get_image) - and the following methods: - `.replace(new_image: PIL.Image.Image, **kwargs)` : - replace the image in the pdf with the new image - applying the saving parameters indicated (such as quality) + @property + def inline_images(self) -> Optional[dict[str, ImageFile]]: + """ + Return only inline images from the page. - Example usage: + .. deprecated:: + Use :attr:`images` and filter by :attr:`ImageFile.is_inline` instead. + This property will be removed in pypdf 7.0. - reader.pages[0].images[0].replace(Image.open("new_image.jpg"), quality=20) + Examples: + >>> from pypdf import PdfReader + >>> reader = PdfReader("example.pdf") + >>> page = reader.pages[0] + >>> inline_images = {k: v for k, v in page.images.items() if v.is_inline} + """ + deprecate_with_replacement( + "PageObject.inline_images", + "PageObject.images", + "7.0", + ) + if self._content_stream_images is None: + return None + return { + image_name: image_file + for image_name, image_file in self._content_stream_images.items() + if image_file.is_inline + } - Inline images are extracted and named ~0~, ~1~, ..., with the - indirect_reference set to None. + @inline_images.setter + def inline_images(self, value: Optional[dict[str, ImageFile]]) -> None: + """ + Setter for inline_images. + + Setting to None clears the cache and forces recalculation on next access, + emulating the previous caching control mechanism. Setting to a dict merges + the values into the existing cache. + .. deprecated:: + Use :attr:`images` and filter by :attr:`ImageFile.is_inline` instead. + This property will be removed in pypdf 7.0. """ - return VirtualListImages(self._get_ids_image, self._get_image) + if value is None: + self._content_stream_images = None + else: + if self._content_stream_images is None: + self._content_stream_images = {} + self._content_stream_images.update(value) def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject: """Translate values used in inline image""" @@ -733,24 +811,92 @@ def _translate_value_inline_image(self, k: str, v: PdfObject) -> PdfObject: raise PdfReadError(f"Cannot find resource entry {v} for {k}") return v - def _get_inline_images(self) -> dict[str, ImageFile]: - """Load inline images. Entries will be identified as `~1~`.""" + def _parse_images_from_content_stream(self) -> dict[str, ImageFile]: + """Load images from content stream. Includes both inline images and Do-referenced images. + + This method scans the page content stream and extracts: + + 1. **Inline images** (~0~, ~1~...): Embedded directly in content stream via BI/EI operators + - is_inline=True, is_displayed=True, indirect_reference=None + + 2. **Do-referenced images** (/Im0, /Im1...): Referenced via "Do" operator + - is_inline=False, is_displayed=True, indirect_reference= + + 3. **Pure XObject images** (/I0, /Image1...): Defined in Resources only (not in content stream) + - is_inline=False, is_displayed=False, indirect_reference= + + Returns: + Dictionary mapping image names to ImageFile instances. + """ + # Idempotent: if already parsed, return cached result + if self._content_stream_images is not None: + return self._content_stream_images + content = self.get_contents() if is_null_or_none(content): return {} imgs_data = [] + do_image_names: list[bytes] = [] assert content is not None, "mypy" for param, ope in content.operations: if ope == b"INLINE IMAGE": imgs_data.append( {"settings": param["settings"], "__streamdata__": param["data"]} ) + elif ope == b"Do" and param: + do_image_names.append(param[0]) # First operand is the XObject name elif ope in (b"BI", b"EI", b"ID"): # pragma: no cover raise PdfReadError( f"{ope!r} operator met whereas not expected, " "please share use case with pypdf dev team" ) + # Process Do-referenced images first files = {} + xobjs: Optional[DictionaryObject] = None + try: + resources = cast(DictionaryObject, self[PG.RESOURCES]) + xobjs = cast(DictionaryObject, resources[RES.XOBJECT]) + except KeyError: + pass # Continue with inline images only + + if xobjs is None: + # No XOBJECT resources, skip Do-referenced images + pass + else: + for do_name in do_image_names: + try: + # Handle both NameObject (str) and bytes + if isinstance(do_name, bytes): + do_name_str = do_name.decode() + else: + do_name_str = str(do_name) + xobj = xobjs[do_name] + # Only process if it's an actual image, not a form + if isinstance(xobj, DictionaryObject) and str(xobj[ImageAttributes.SUBTYPE]) == "/Image": + from .generic._image_xobject import _xobj_to_image as _xobj_to_image2 # noqa: PLC0415 + imgd = _xobj_to_image2(xobj) + extension, byte_stream, img = imgd + img_file = ImageFile( + name=f"{do_name_str.lstrip('/')}{extension}", + data=byte_stream, + image=img, + indirect_reference=xobj.indirect_reference, + is_inline=False, + is_displayed=True, # Do-referenced images are always displayed + ) + files[do_name_str] = img_file + except KeyError: + continue + except OSError as e: + logger_warning( + "Failed loading image %(image_name)s: %(exception)s", + source=__name__, + image_name=do_name_str, + exception=e, + ) + continue + + # Then process inline images for num, ii in enumerate(imgs_data): init = { "__streamdata__": ii["__streamdata__"], @@ -776,7 +922,10 @@ def _get_inline_images(self) -> dict[str, ImageFile]: data=byte_stream, image=img, indirect_reference=None, + is_inline=True, + is_displayed=True, ) + return files @property @@ -1061,8 +1210,8 @@ def replace_contents( # as a backup solution, we put content as an object although not in accordance with pdf ref # this will be fixed with the _add_object self[NameObject(PG.CONTENTS)] = content - # forces recalculation of inline_images - self.inline_images = None + # forces recalculation of images + self._content_stream_images = None def merge_page( self, page2: "PageObject", expand: bool = False, over: bool = True diff --git a/sample-files b/sample-files index 8c405ece5..4c08ef37c 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 8c405ece5eff12396a34a1fae3276132002e1753 +Subproject commit 4c08ef37ce9dbedf7887b93e470ce5e98dedc5df diff --git a/tests/test_filters.py b/tests/test_filters.py index 93f7fd8d8..d6aef434d 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -589,6 +589,7 @@ def test_jpx_no_spacecode(): # create an object without filter and without colorspace # just for coverage del im.indirect_reference.get_object()["/Filter"] + reader.pages[0]._content_stream_images = None # invalidate cache with pytest.raises(PdfReadError) as exc: reader.pages[0].images[0] assert exc.value.args[0].startswith("ColorSpace field not found") diff --git a/tests/test_images.py b/tests/test_images.py index c4f13b54b..dbcc105ef 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -5,6 +5,7 @@ and/or the actual image data with the expected value. """ +import warnings from io import BytesIO from pathlib import Path from typing import Union @@ -229,9 +230,9 @@ def test_image_extraction(src, page_index, image_key, expected): def test_get_inline_image_without_xobject_resources(): page = PageObject(None, None) - inline_image = object() + inline_image = mock.Mock(is_inline=True, is_displayed=True) - with mock.patch.object(page, "_get_inline_images", return_value={"~0~": inline_image}): + with mock.patch.object(page, "_parse_images_from_content_stream", return_value={"~0~": inline_image}): assert page._get_image("~0~") is inline_image @@ -239,8 +240,8 @@ def test_get_inline_image_without_xobject_resources_raises_when_missing(): page = PageObject(None, None) with ( - mock.patch.object(page, "_get_inline_images", return_value=None), - pytest.raises(KeyError, match="No inline image can be found"), + mock.patch.object(page, "_parse_images_from_content_stream", return_value=None), + pytest.raises(KeyError, match="No image can be found"), ): page._get_image("~0~") @@ -311,6 +312,7 @@ def test_separation_1byte_to_rgb_inverted(): assert image_similarity(reader.pages[0].images[0].image, img) >= 0.99 obj = reader.pages[0].images[0].indirect_reference.get_object() obj.set_data(obj.get_data() + b"\x00") + reader.pages[0]._content_stream_images = None # invalidate cache with pytest.raises(ValueError): reader.pages[0].images[0] @@ -442,9 +444,9 @@ def test_inline_image_extraction(): assert image_similarity(writer.pages[0].images[i].image, img) == 1 writer.pages[0].extract_text() # check recalculation of inline images - assert writer.pages[0].inline_images is not None + assert writer.pages[0]._content_stream_images is not None writer.pages[0].merge_scaled_page(writer.pages[0], 0.25) - assert writer.pages[0].inline_images is None + assert writer.pages[0]._content_stream_images is None reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") writer.pages[0].merge_page(reader.pages[0]) assert list(writer.pages[0].images.keys()) == [ @@ -556,7 +558,7 @@ def test_contentstream__read_inline_image__fallback_is_successful(): ) page = PageObject(pdf=None) with mock.patch.object(page, "get_contents", return_value=stream): - images = page._get_inline_images() + images = page._parse_images_from_content_stream() assert list(images) == ["~0~"] assert images["~0~"].data == ( b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x02\x00\x00\x00\x01\x08\x02\x00\x00\x00{@\xe8\xdd\x00\x00\x00\x0f" @@ -658,3 +660,108 @@ def test_get_ids_image__resources_is_none(): reader = PdfReader(BytesIO(get_data_from_url(url=url, name=name))) page = reader.pages[2] assert list(page.images.items()) == [] + + +@pytest.mark.samples +def test_is_xobject_image_displayed(): + """Test XObject image display detection with expected results.""" + path = SAMPLE_ROOT / "028-image-references-deduplication/wrong-references.pdf" + reader = PdfReader(path) + + expected_results = [ + # Page 1: /Im20 not displayed, /Im8 displayed + (0, "/Im20", False), + (0, "/Im8", True), + # Page 2: Neither displayed + (1, "/Im20", False), + (1, "/Im8", False), + # Page 3: /Im20 displayed, /Im8 not displayed + (2, "/Im20", True), + (2, "/Im8", False), + ] + + for page_num, image_id, expected in expected_results: + img = reader.pages[page_num].images[image_id] + assert img.is_displayed == expected, f"Page {page_num}: {image_id} expected {expected}, got {img.is_displayed}" + + +@pytest.mark.samples +def test_is_inline_image_displayed(): + """This test ensures that displayed inline images are detected by `ImageFile.is_displayed`""" + path = SAMPLE_ROOT / "008-reportlab-inline-image/inline-image.pdf" + reader = PdfReader(path) + + # Page 1: + expected_results = [ + (0, "~0~", True), + ] + + for page_num, image_id, expected in expected_results: + page = reader.pages[page_num] + img = page.images[image_id] + assert img.is_displayed == expected, f"Page {page_num}: {image_id} expected {expected}, got {img.is_displayed}" + + +@pytest.mark.samples +def test_inline_images_property_deprecation_warning(): + """Test that inline_images property emits a deprecation warning.""" + reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") + page = reader.pages[0] + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + _ = page.inline_images + assert len(w) == 1 + assert issubclass(w[0].category, DeprecationWarning) + assert "inline_images" in str(w[0].message) + assert "images" in str(w[0].message) + + +@pytest.mark.samples +def test_inline_images_property_returns_only_inline(): + """Test that inline_images returns only images with is_inline=True.""" + reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") + page = reader.pages[0] + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + inline = page.inline_images + if inline is not None: + for k, v in inline.items(): + assert v.is_inline is True, f"Image {k} should have is_inline=True" + + +@pytest.mark.samples +def test_inline_images_setter_clears_cache(): + """Test that setting inline_images to None clears the cache.""" + reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") + page = reader.pages[0] + + # Force cache population by accessing images + _ = list(page.images) + assert page._content_stream_images is not None + + # Clear cache via setter + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + page.inline_images = None + assert page._content_stream_images is None + + +@pytest.mark.samples +def test_inline_images_setter_merges(): + """Test that setting inline_images to a dict merges into the cache.""" + reader = PdfReader(RESOURCE_ROOT / "imagemagick-ASCII85Decode.pdf") + page = reader.pages[0] + + # Force cache population by accessing images + _ = list(page.images) + original_keys = set(page._content_stream_images.keys()) + + # Merge new values + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + page.inline_images = {"new_key": page.images[0]} + merged_keys = set(page._content_stream_images.keys()) + assert original_keys.issubset(merged_keys), "Original keys should be preserved" + assert "new_key" in merged_keys, "New key should be added" diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 401015666..cdf870371 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1032,7 +1032,7 @@ def test_inline_images(): with pytest.raises(KeyError): reader.pages[0].images["~999~"] del reader.pages[1]["/Resources"]["/ColorSpace"]["/R124"] - reader.pages[1].inline_images = None # to force recalculation + reader.pages[1]._content_stream_images = None # to force recalculation with pytest.raises(PdfReadError): reader.pages[1].images["~1~"]