diff --git a/docs/user/security.md b/docs/user/security.md index d26ab51b1f..7bfc99195c 100644 --- a/docs/user/security.md +++ b/docs/user/security.md @@ -50,6 +50,13 @@ For *PdfWriter* instances, the following limits are employed for incremental rea * `incremental_clone_object_id_limit` limits the maximum object ID to read during cloning. It defaults to 1 000 000. Setting it to `None` will fully disable this limit. +### XMP + +For reading the XML-based XMP metadata, the following limits apply: + +* `pypdf.xmp.XMP_MAX_INPUT_LENGTH` for the maximum stream length, defaulting to 5 MB. +* `pypdf.xmp.XMP_MAX_ELEMENT_COUNT` for the maximum number of elements, defaulting to 100 000. + ## Reporting possible vulnerabilities Please refer to our [security policy](https://github.com/py-pdf/pypdf/security/policy). diff --git a/pypdf/xmp.py b/pypdf/xmp.py index 8e399a5a99..6ccf1fe29a 100644 --- a/pypdf/xmp.py +++ b/pypdf/xmp.py @@ -19,13 +19,17 @@ from xml.dom.expatbuilder import ExpatBuilderNS from xml.dom.minidom import Document from xml.dom.minidom import Element as XmlElement +from xml.dom.xmlbuilder import Options from xml.parsers.expat import ExpatError, XMLParserType from ._protocols import XmpInformationProtocol from ._utils import StreamType, deprecate_with_replacement, deprecation_no_replacement -from .errors import PdfReadError, XmpDocumentError +from .errors import LimitReachedError, PdfReadError, XmpDocumentError from .generic import ContentStream, PdfObject +XMP_MAX_INPUT_LENGTH = 5_000_000 +XMP_MAX_ELEMENT_COUNT = 100_000 + RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" DC_NAMESPACE = "http://purl.org/dc/elements/1.1/" XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/" @@ -173,6 +177,10 @@ class _XmpBuilder(ExpatBuilderNS): but not cases like quadratic entity expansion which can still cause quite some memory usage. 
""" + def __init__(self, options: Optional[Options] = None) -> None: + super().__init__(options=options) + self._element_count = 0 + def custom_entity_declaration_handler( self, entity_name: str, @@ -185,10 +193,17 @@ def custom_entity_declaration_handler( ) -> None: raise ExpatError(f"Forbidden entities: {entity_name!r}") + def start_element_handler(self, name: str, attributes: list[str]) -> None: + self._element_count += 1 + if self._element_count > XMP_MAX_ELEMENT_COUNT: + raise LimitReachedError(f"XMP metadata exceeds limit of {XMP_MAX_ELEMENT_COUNT} elements.") + super().start_element_handler(name=name, attributes=attributes) + def install(self, parser: XMLParserType) -> None: super().install(parser) parser.EntityDeclHandler = self.custom_entity_declaration_handler + parser.StartElementHandler = self.start_element_handler class XmpInformation(XmpInformationProtocol, PdfObject): @@ -205,6 +220,8 @@ def __init__(self, stream: ContentStream) -> None: self.stream = stream try: data = self.stream.get_data() + if (length := len(data)) > XMP_MAX_INPUT_LENGTH: + raise LimitReachedError(f"XMP stream size {length} exceeds limit of {XMP_MAX_INPUT_LENGTH}.") doc_root: Document = _XmpBuilder().parseString(data) except (AttributeError, ExpatError) as e: raise PdfReadError(f"XML in XmpInformation was invalid: {e}") diff --git a/tests/test_xmp.py b/tests/test_xmp.py index 213e725cee..7cbe33bbce 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -7,7 +7,7 @@ import pypdf.generic import pypdf.xmp from pypdf import PdfReader, PdfWriter -from pypdf.errors import PdfReadError, XmpDocumentError +from pypdf.errors import LimitReachedError, PdfReadError, XmpDocumentError from pypdf.generic import ContentStream, NameObject, StreamObject from pypdf.xmp import XmpInformation @@ -963,3 +963,34 @@ def test_xmp_information__quadratic_entity_expansion(): match=r"^XML in XmpInformation was invalid: Forbidden entities: 'a'$" ): XmpInformation(stream) + + +@pytest.mark.timeout(10) +def 
test_xmp_information__input_limit(): + stream = ContentStream(pdf=None, stream=None) + stream.set_data(b"A" * 10_000_000) + + with pytest.raises( + expected_exception=LimitReachedError, + match=r"^XMP stream size 10000000 exceeds limit of 5000000\.$" + ): + XmpInformation(stream) + + +@pytest.mark.timeout(10) +def test_xmp_information__element_limit(): + stream = ContentStream(pdf=None, stream=None) + + xmp = b'<?xml version="1.0"?>\n' + xmp += b'<x:xmpmeta xmlns:x="adobe:ns:meta/">' + xmp += b'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">' + xmp += b'<rdf:Description rdf:about="">' + xmp += b"<rdf:li/>" * 100_010 + xmp += b"</rdf:Description></rdf:RDF></x:xmpmeta>" + stream.set_data(xmp) + + with pytest.raises( + expected_exception=LimitReachedError, + match=r"^XMP metadata exceeds limit of 100000 elements\.$" + ): + XmpInformation(stream)