From a446c9587bc5506fb571700ff10c9831cde73606 Mon Sep 17 00:00:00 2001 From: Markus Weigelt Date: Mon, 1 Sep 2025 18:09:04 +0200 Subject: [PATCH 1/2] [FEATURE] Indexing TEI full texts (#1712) Co-authored-by: Sebastian Meyer --- Classes/Common/FullTextReader.php | 11 ++- Classes/Common/FulltextInterface.php | 10 ++ Classes/Format/Alto.php | 9 +- Classes/Format/Tei.php | 132 +++++++++++++++++++++++++++ Documentation/User/Index.rst | 74 ++++++++++++++- Tests/Fixtures/Format/tei.xml | 32 +++++++ Tests/Unit/Format/TeiTest.php | 54 +++++++++++ 7 files changed, 316 insertions(+), 6 deletions(-) create mode 100644 Classes/Format/Tei.php create mode 100644 Tests/Fixtures/Format/tei.xml create mode 100644 Tests/Unit/Format/TeiTest.php diff --git a/Classes/Common/FullTextReader.php b/Classes/Common/FullTextReader.php index e3f060d74a..9f430321e1 100644 --- a/Classes/Common/FullTextReader.php +++ b/Classes/Common/FullTextReader.php @@ -33,7 +33,7 @@ class FullTextReader /** * Constructor - * + * * @param array $formats */ public function __construct(array $formats) @@ -44,7 +44,7 @@ public function __construct(array $formats) /** * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an - * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have + * XML full text representation. For IIIF manifests, ALTO documents have * to be given in the Canvas' / Manifest's "seeAlso" property. 
* * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property @@ -83,7 +83,7 @@ public function getFromXml(string $id, array $fileLocations, $physicalStructureN if (!empty($fileContent) && !empty($this->formats[$textFormat])) { $textMiniOcr = ''; if (!empty($this->formats[$textFormat]['class'])) { - $textMiniOcr = $this->getRawTextFromClass($fileContent, $textFormat); + $textMiniOcr = $this->getRawTextFromClass($id, $fileContent, $textFormat); } $fullText = $textMiniOcr; } else { @@ -98,12 +98,14 @@ public function getFromXml(string $id, array $fileLocations, $physicalStructureN * * @access private * + * @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property + * of the Manifest / Range (IIIF) * @param string $fileContent The content of the XML file * @param string $textFormat * * @return string */ - private function getRawTextFromClass(string $fileContent, string $textFormat): string + private function getRawTextFromClass(string $id, string $fileContent, string $textFormat): string { $textMiniOcr = ''; $class = $this->formats[$textFormat]['class']; @@ -113,6 +115,7 @@ private function getRawTextFromClass(string $fileContent, string $textFormat): s if ($obj instanceof FulltextInterface) { // Load XML from file. $ocrTextXml = Helper::getXmlFileAsString($fileContent); + $obj->setPageId($id); $textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml); } else { $this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"'); diff --git a/Classes/Common/FulltextInterface.php b/Classes/Common/FulltextInterface.php index 2b0c3098d7..da6c9b7d9b 100644 --- a/Classes/Common/FulltextInterface.php +++ b/Classes/Common/FulltextInterface.php @@ -24,6 +24,16 @@ */ interface FulltextInterface { + + /** + * Set the page identifier. + * + * @access public + * + * @param string $pageId The page identifier of mets:div in the physical struct map of the METS. 
+ */ + public function setPageId(string $pageId): void; + /** * This extracts raw fulltext data from XML * diff --git a/Classes/Format/Alto.php b/Classes/Format/Alto.php index c358f16931..c7f9771956 100644 --- a/Classes/Format/Alto.php +++ b/Classes/Format/Alto.php @@ -12,6 +12,8 @@ namespace Kitodo\Dlf\Format; +use Kitodo\Dlf\Common\FulltextInterface; + /** * Fulltext ALTO format class for the 'dlf' extension * @@ -22,7 +24,7 @@ * * @access public */ -class Alto implements \Kitodo\Dlf\Common\FulltextInterface +class Alto implements FulltextInterface { /** * This extracts the fulltext data from ALTO XML @@ -159,4 +161,9 @@ private function registerAltoNamespace(\SimpleXMLElement &$xml) $xml->registerXPathNamespace('alto', 'http://www.loc.gov/standards/alto/ns-v4#'); } } + + public function setPageId(string $pageId): void + { + // Nothing to do here. + } } diff --git a/Classes/Format/Tei.php b/Classes/Format/Tei.php new file mode 100644 index 0000000000..e5d2c1de6b --- /dev/null +++ b/Classes/Format/Tei.php @@ -0,0 +1,132 @@ + + * + * This file is part of the Kitodo and TYPO3 projects. + * + * @license GNU General Public License version 3 or later. + * For the full copyright and license information, please read the + * LICENSE.txt file that was distributed with this source code. 
+ */ + +namespace Kitodo\Dlf\Format; + +use Kitodo\Dlf\Common\FulltextInterface; +use Psr\Log\LoggerAwareInterface; +use Psr\Log\LoggerAwareTrait; +use SimpleXMLElement; + +/** + * Fulltext TEI format class for the 'dlf' extension + * + * ** This currently supports TEI documents in the namespace http://www.tei-c.org/ns/1.0 ** + * + * @package TYPO3 + * @subpackage dlf + * + * @access public + */ +class Tei implements FulltextInterface, LoggerAwareInterface +{ + use LoggerAwareTrait; + + private string $pageId; + + public function setPageId(string $pageId): void + { + $this->pageId = $pageId; + } + + /** + * This extracts the fulltext data from TEI XML + * + * @access public + * + * @param \SimpleXMLElement $xml The XML to extract the raw text from + * + * @return string The raw unformatted fulltext + */ + public function getRawText(\SimpleXMLElement $xml): string + { + if (empty($this->pageId)) { + $this->logger->warning('Text could not be retrieved from TEI because the page ID is empty.'); + return ''; + } + + // register TEI namespace depending on document + $this->registerTeiNamespace($xml); + + // Serialize the TEI text element. + $contentXml = $xml->xpath('./TEI:text')[0]->asXML(); + + // Remove tags but keep their content + $contentXml = preg_replace('/<\/?(?:body|front|div|head|titlePage)[^>]*>/u', '', $contentXml); + + // Replace linebreaks + $contentXml = preg_replace('/]*)?\/>/u', '', $contentXml); + $contentXml = preg_replace('/\s+/', ' ', $contentXml); + + // Extract content between each pb (page break) element and the next pb element or end of string + $pattern = '/]*facs="([^"]+)"[^>]*\/>([\s\S]*?)(?=]*\/>|$)/u'; + $facs = []; + + // Use preg_match_all to get all matches at once + if (preg_match_all($pattern, $contentXml, $matches, PREG_SET_ORDER)) { + foreach ($matches as $match) { + $facsMatch = trim($match[1]); + $facsId = str_starts_with($facsMatch, "#") ? 
substr($facsMatch, 1) : $facsMatch; + $facs[$facsId] = trim(strip_tags($match[2])); // Everything until next or end of string + } + } + + if (!array_key_exists($this->pageId, $facs)) { + $this->logger->debug('The page break attribute "facs" with the page identifier postfix "' . $this->pageId . '" could not be found in the TEI document'); + return ''; + } + + return $facs[$this->pageId]; + } + + /** + * This extracts the fulltext data from TEI XML and returns it in MiniOCR format + * + * @access public + * + * @param \SimpleXMLElement $xml The XML to extract the raw text from + * + * @return string The unformatted fulltext in MiniOCR format + */ + public function getTextAsMiniOcr(\SimpleXMLElement $xml): string + { + $rawText = $this->getRawText($xml); + + if (empty($rawText)) { + return ''; + } + + $miniOcr = new SimpleXMLElement(""); + $miniOcr->addChild('b', $rawText); + $miniOcrXml = $miniOcr->asXml(); + if (\is_string($miniOcrXml)) { + return $miniOcrXml; + } + return ''; + } + + /** + * This registers the necessary TEI namespace for the current TEI-XML + * + * @access private + * + * @param \SimpleXMLElement &$xml: The XML to register the namespace for + */ + private function registerTeiNamespace(\SimpleXMLElement $xml) + { + $namespace = $xml->getDocNamespaces(); + + if (in_array('http://www.tei-c.org/ns/1.0', $namespace, true)) { + $xml->registerXPathNamespace('TEI', 'http://www.tei-c.org/ns/1.0'); + } + } +} diff --git a/Documentation/User/Index.rst b/Documentation/User/Index.rst index 4596f9a402..159171d0d3 100644 --- a/Documentation/User/Index.rst +++ b/Documentation/User/Index.rst @@ -16,7 +16,6 @@ User Manual :local: :depth: 2 - .. _indexing_documents: Indexing Documents @@ -545,3 +544,76 @@ With the command `kitodo:optimize` it is possible to hard commit documents to an Show each processed documents uid and location with timestamp and amount of processed/all documents. :Example: + + +.. 
_indexing_fulltexts: + +Indexing full texts +=================== + +Full texts must be provided in the ``FULLTEXT`` file group within the METS. Kitodo.Presentation supports the ALTO and TEI formats for indexing full texts. + +**ALTO** + +Each ALTO file contains the full text of a single page of the document. + +.. code-block:: xml + + + + + + + + ... + + +**TEI** + +A single TEI file contains the full text of the entire document. + +.. code-block:: xml + + + + + + +.. note:: + + The identifier of the ``facsimile`` tag (and thus the ``pb`` tag (page break) references) in the TEI must match the ``ID`` attribute of the ``mets:div`` with type ``page`` in the physical structMap of the METS. Otherwise, the pages cannot be mapped and will not be indexed. + + +For indexing full texts, the formats need to be defined in the Data Formats or in the table ``tx_dlf_formats`` with the following settings. + +.. t3-field-list-table:: + :header-rows: 1 + + - :Type: + Format Name (e.g. in METS) + :Root: + Root Element + :Namespace: + Namespace URI + :Class: + Class Name + + - :Type: + ALTO + :Root: + alto + :Namespace: + http://www.loc.gov/standards/alto/ns-v2# + :Class: + ``Kitodo\Dlf\Format\Alto`` + + - :Type: + TEI + :Root: + TEI + :Namespace: + http://www.tei-c.org/ns/1.0 + :Class: + ``Kitodo\Dlf\Format\Tei`` + +After configuration, all full texts will be indexed when executing the commands of :ref:`indexing_documents`. diff --git a/Tests/Fixtures/Format/tei.xml b/Tests/Fixtures/Format/tei.xml new file mode 100644 index 0000000000..0b2db72cde --- /dev/null +++ b/Tests/Fixtures/Format/tei.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + +

+ Lorem ipsum dolor sit amet, consectetuer adipiscing elit. + +

+
+
+ +
+ +
+
+ + + Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. + +
+ + +
+
diff --git a/Tests/Unit/Format/TeiTest.php b/Tests/Unit/Format/TeiTest.php new file mode 100644 index 0000000000..1c409048f2 --- /dev/null +++ b/Tests/Unit/Format/TeiTest.php @@ -0,0 +1,54 @@ + + * + * This file is part of the Kitodo and TYPO3 projects. + * + * @license GNU General Public License version 3 or later. + * For the full copyright and license information, please read the + * LICENSE.txt file that was distributed with this source code. + */ + +namespace Kitodo\Dlf\Tests\Unit\Format; + +use Kitodo\Dlf\Format\Tei; +use TYPO3\TestingFramework\Core\Unit\UnitTestCase; + +class TeiTest extends UnitTestCase +{ + /** + * @test + * @group extract data + */ + public function getRawData(): void + { + $xml = simplexml_load_file(__DIR__ . '/../../Fixtures/Format/tei.xml'); + $tei = new Tei(); + $tei->setPageId('f0001'); + $rawText = $tei->getRawText($xml); + + self::assertEquals('Lorem ipsum dolor sit amet, consectetuer adipiscing elit.', $rawText); + } + + /** + * @test + * @group extract data + */ + public function getTextAsMiniOcr(): void + { + $xml = simplexml_load_file(__DIR__ . '/../../Fixtures/Format/tei.xml'); + $tei = new Tei(); + $tei->setPageId('f0002'); + $rawText = $tei->getTextAsMiniOcr($xml); + + $miniOCR = << + Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. 
+ + XML; + + self::assertXmlStringEqualsXmlString($miniOCR, $rawText); + } + +} From b67ee11fb0158fe123819dfbde0f052a7ef4b7ca Mon Sep 17 00:00:00 2001 From: Sebastian Meyer Date: Wed, 3 Sep 2025 18:31:42 +0200 Subject: [PATCH 2/2] Fix code styling --- Tests/Unit/Format/TeiTest.php | 1 - 1 file changed, 1 deletion(-) diff --git a/Tests/Unit/Format/TeiTest.php b/Tests/Unit/Format/TeiTest.php index 1c409048f2..d30d05afc1 100644 --- a/Tests/Unit/Format/TeiTest.php +++ b/Tests/Unit/Format/TeiTest.php @@ -50,5 +50,4 @@ public function getTextAsMiniOcr(): void self::assertXmlStringEqualsXmlString($miniOCR, $rawText); } - }