Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/user/security.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@ For *PdfWriter* instances, the following limits are employed for incremental rea
* `incremental_clone_object_id_limit` limits the maximum object ID to read during cloning. It defaults to
1 000 000. Setting it to `None` will fully disable this limit.

### XMP

When reading the XML-based XMP metadata, the following module-level limits apply. Exceeding either of them raises a `LimitReachedError`:

* `pypdf.xmp.XMP_MAX_INPUT_LENGTH` limits the maximum length of the XMP stream data in bytes. It defaults to
  5 000 000 (5 MB).
* `pypdf.xmp.XMP_MAX_ELEMENT_COUNT` limits the maximum number of XML elements in the XMP metadata. It defaults to
  100 000.

## Reporting possible vulnerabilities

Please refer to our [security policy](https://github.com/py-pdf/pypdf/security/policy).
Expand Down
19 changes: 18 additions & 1 deletion pypdf/xmp.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,17 @@
from xml.dom.expatbuilder import ExpatBuilderNS
from xml.dom.minidom import Document
from xml.dom.minidom import Element as XmlElement
from xml.dom.xmlbuilder import Options
from xml.parsers.expat import ExpatError, XMLParserType

from ._protocols import XmpInformationProtocol
from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement
from .errors import PdfReadError, XmpDocumentError
from .errors import LimitReachedError, PdfReadError, XmpDocumentError
from .generic import ContentStream, PdfObject

XMP_MAX_INPUT_LENGTH = 5_000_000
XMP_MAX_ELEMENT_COUNT = 100_000

RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
Expand Down Expand Up @@ -173,6 +177,10 @@ class _XmpBuilder(ExpatBuilderNS):
but not cases like quadratic entity expansion which can still cause quite some memory usage.
"""

def __init__(self, options: Optional[Options] = None) -> None:
    """
    Create the builder with an element counter starting at zero.

    Args:
        options: Optional builder options, forwarded unchanged to the parent builder.
    """
    # Number of start tags seen so far; ``start_element_handler`` checks it
    # against ``XMP_MAX_ELEMENT_COUNT``.
    self._element_count = 0
    super().__init__(options=options)

def custom_entity_declaration_handler(
self,
entity_name: str,
Expand All @@ -185,10 +193,17 @@ def custom_entity_declaration_handler(
) -> None:
raise ExpatError(f"Forbidden entities: {entity_name!r}")

def start_element_handler(self, name: str, attributes: list[str]) -> None:
    """
    Count an opening element and forward it to the parent builder.

    Args:
        name: Element name as supplied by the parser.
        attributes: Attribute list as supplied by the parser.

    Raises:
        LimitReachedError: If more than ``XMP_MAX_ELEMENT_COUNT`` elements have been started.
    """
    seen = self._element_count + 1
    self._element_count = seen
    # Check the limit before delegating, so the element over the limit is
    # never handed to the parent builder.
    if seen > XMP_MAX_ELEMENT_COUNT:
        raise LimitReachedError(f"XMP metadata exceeds limit of {XMP_MAX_ELEMENT_COUNT} elements.")
    super().start_element_handler(name=name, attributes=attributes)

def install(self, parser: XMLParserType) -> None:
    """
    Register the parent's handlers on the parser, then the guarded ones.

    Args:
        parser: The expat parser instance the handlers are attached to.
    """
    super().install(parser)

    # Assigned after the parent call so these two handlers are the ones
    # left in place on the parser.
    parser.StartElementHandler = self.start_element_handler
    parser.EntityDeclHandler = self.custom_entity_declaration_handler


class XmpInformation(XmpInformationProtocol, PdfObject):
Expand All @@ -205,6 +220,8 @@ def __init__(self, stream: ContentStream) -> None:
self.stream = stream
try:
data = self.stream.get_data()
if (length := len(data)) > XMP_MAX_INPUT_LENGTH:
raise LimitReachedError(f"XMP stream size {length} exceeds limit of {XMP_MAX_INPUT_LENGTH}.")
doc_root: Document = _XmpBuilder().parseString(data)
except (AttributeError, ExpatError) as e:
raise PdfReadError(f"XML in XmpInformation was invalid: {e}")
Expand Down
33 changes: 32 additions & 1 deletion tests/test_xmp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pypdf.generic
import pypdf.xmp
from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError, XmpDocumentError
from pypdf.errors import LimitReachedError, PdfReadError, XmpDocumentError
from pypdf.generic import ContentStream, NameObject, StreamObject
from pypdf.xmp import XmpInformation

Expand Down Expand Up @@ -963,3 +963,34 @@ def test_xmp_information__quadratic_entity_expansion():
match=r"^XML in XmpInformation was invalid: Forbidden entities: 'a'$"
):
XmpInformation(stream)


@pytest.mark.timeout(10)
def test_xmp_information__input_limit():
    """A 10 000 000 byte XMP stream is rejected with a ``LimitReachedError``."""
    oversized_payload = b"A" * 10_000_000
    stream = ContentStream(pdf=None, stream=None)
    stream.set_data(oversized_payload)

    expected_message = r"^XMP stream size 10000000 exceeds limit of 5000000\.$"
    with pytest.raises(LimitReachedError, match=expected_message):
        XmpInformation(stream)


@pytest.mark.timeout(10)
def test_xmp_information__element_limit():
    """An XMP packet with 100 010 child elements is rejected with a ``LimitReachedError``."""
    # Three wrapper elements plus 100 010 empty children: above the element
    # limit asserted below.
    xmp = b"".join(
        (
            b'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>\n',
            b'<x:xmpmeta xmlns:x="adobe:ns:meta/">',
            b'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">',
            b'<rdf:Description rdf:about="" xmlns:custom="urn:custom">',
            b"<custom:a/>" * 100_010,
            b"</rdf:Description></rdf:RDF></x:xmpmeta>",
        )
    )
    stream = ContentStream(pdf=None, stream=None)
    stream.set_data(xmp)

    expected_message = r"^XMP metadata exceeds limit of 100000 elements\.$"
    with pytest.raises(LimitReachedError, match=expected_message):
        XmpInformation(stream)
Loading