-
Notifications
You must be signed in to change notification settings - Fork 47
[FEATURE] Indexing TEI full texts #1712
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 7 commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
f5ba636
Add TEI fulltext interface, test and fixture
markusweigelt be248cb
Add indexing of TEI fulltext using page id, add tests to retrieve ful…
markusweigelt 18efa5b
Merge branch 'kitodo:main' into tei-fulltext
markusweigelt 0880dff
Add fulltext documentation
markusweigelt 38dbdfe
Update documentation and improve TEI xml
markusweigelt dd819b7
Improve comments
markusweigelt d0189c7
Rename variable contentHTMl to contentXml
markusweigelt 31358b1
Update Classes/Common/FulltextInterface.php
sebastian-meyer c2a7036
Update Classes/Format/Tei.php
sebastian-meyer c603a80
Update Classes/Format/Tei.php
sebastian-meyer 2b9ec2a
Update Classes/Format/Tei.php
sebastian-meyer d9c1166
Update Classes/Format/Tei.php
sebastian-meyer 4b6e748
Update Classes/Format/Tei.php
sebastian-meyer b3d5370
Merge branch '6.x' into tei-fulltext
sebastian-meyer File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,132 @@ | ||
| <?php | ||
|
|
||
| /** | ||
| * (c) Kitodo. Key to digital objects e.V. <contact@kitodo.org> | ||
| * | ||
| * This file is part of the Kitodo and TYPO3 projects. | ||
| * | ||
| * @license GNU General Public License version 3 or later. | ||
| * For the full copyright and license information, please read the | ||
| * LICENSE.txt file that was distributed with this source code. | ||
| */ | ||
|
|
||
| namespace Kitodo\Dlf\Format; | ||
|
|
||
| use Kitodo\Dlf\Common\FulltextInterface; | ||
| use Psr\Log\LoggerAwareInterface; | ||
| use Psr\Log\LoggerAwareTrait; | ||
|
sebastian-meyer marked this conversation as resolved.
|
||
|
|
||
/**
 * Fulltext TEI format class for the 'dlf' extension
 *
 * ** This currently supports TEI documents using the namespace http://www.tei-c.org/ns/1.0 **
 *
 * The text of a single page is identified by the "facs" attribute of the
 * page break element (<pb/>) that precedes it.
 *
 * @package TYPO3
 * @subpackage dlf
 *
 * @access public
 */
class Tei implements FulltextInterface, LoggerAwareInterface
{
    use LoggerAwareTrait;

    /**
     * @var string Namespace URI of TEI documents
     */
    private const TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0';

    /**
     * @var string Identifier of the page to extract (value of the <pb/> "facs" attribute without leading "#")
     */
    private string $pageId = '';

    /**
     * This sets the identifier of the page whose text should be extracted
     *
     * @access public
     *
     * @param string $pageId The page identifier as used in the "facs" attribute of <pb/> (without leading "#")
     *
     * @return void
     */
    public function setPageId(string $pageId): void
    {
        $this->pageId = $pageId;
    }

    /**
     * This extracts the fulltext data of the configured page from TEI XML
     *
     * @access public
     *
     * @param \SimpleXMLElement $xml The XML to extract the raw text from
     *
     * @return string The raw unformatted fulltext or an empty string if no text could be found for the page
     */
    public function getRawText(\SimpleXMLElement $xml): string
    {
        // Compare against the empty string explicitly: empty() would also reject the valid ID "0".
        if ($this->pageId === '') {
            $this->logger?->warning('Text could not be retrieved from TEI because the page ID is empty.');
            return '';
        }

        // Register the TEI namespace prefix used by the XPath query below.
        $this->registerTeiNamespace($xml);

        // xpath() returns false on error and an empty array if nothing matches,
        // so the result must be checked before accessing the first node.
        $textNodes = $xml->xpath('./TEI:text');
        if (empty($textNodes)) {
            $this->logger?->warning('Text could not be retrieved from TEI because the document contains no text element in the TEI namespace.');
            return '';
        }

        $contentXml = $textNodes[0]->asXML();
        if (!\is_string($contentXml)) {
            $this->logger?->warning('Text could not be retrieved from TEI because the text element could not be serialized.');
            return '';
        }

        $pageTexts = $this->extractPageTexts($contentXml);

        if (!array_key_exists($this->pageId, $pageTexts)) {
            $this->logger?->debug('The page break attribute "facs" with the page identifier postfix "' . $this->pageId . '" could not be found in the TEI document');
            return '';
        }

        return $pageTexts[$this->pageId];
    }

    /**
     * This extracts the fulltext data from TEI XML and returns it in MiniOCR format
     *
     * @access public
     *
     * @param \SimpleXMLElement $xml The XML to extract the raw text from
     *
     * @return string The unformatted fulltext in MiniOCR format or an empty string if there is no text
     */
    public function getTextAsMiniOcr(\SimpleXMLElement $xml): string
    {
        $rawText = $this->getRawText($xml);

        // Compare against the empty string explicitly: empty() would also drop a page containing only "0".
        if ($rawText === '') {
            return '';
        }

        $miniOcr = new \SimpleXMLElement('<ocr></ocr>');
        // addChild() interprets entity references in the value, so the plain text must be escaped first.
        $miniOcr->addChild('b', htmlspecialchars($rawText, ENT_XML1 | ENT_QUOTES, 'UTF-8'));
        $miniOcrXml = $miniOcr->asXML();

        return \is_string($miniOcrXml) ? $miniOcrXml : '';
    }

    /**
     * This splits the serialized TEI text element into the plain text of each page
     *
     * @access private
     *
     * @param string $contentXml The serialized <text> element of the TEI document
     *
     * @return array<int|string, string> The plain text of each page, indexed by the "facs" value of its <pb/> (without leading "#")
     */
    private function extractPageTexts(string $contentXml): array
    {
        $contentXml = preg_replace(
            [
                // Remove structural tags but keep their content
                '/<\/?(?:body|front|div|head|titlePage)[^>]*>/u',
                // Remove line break elements
                '/<lb(?:\s[^>]*)?\/>/u',
                // Collapse all whitespace (including newlines) into single spaces
                '/\s+/',
            ],
            ['', '', ' '],
            $contentXml
        );

        // preg_replace() returns null on failure (e.g. malformed UTF-8 with the "u" modifier).
        if (!\is_string($contentXml)) {
            return [];
        }

        // Capture the "facs" value of each <pb/> and everything up to the next <pb/> or the end of the string
        $pattern = '/<pb[^>]*facs="([^"]+)"[^>]*\/>([\s\S]*?)(?=<pb[^>]*\/>|$)/u';
        $pageTexts = [];

        if (preg_match_all($pattern, $contentXml, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $match) {
                $facs = trim($match[1]);
                $facsId = str_starts_with($facs, '#') ? substr($facs, 1) : $facs;
                // Strip the remaining markup first, then decode the XML entities left over from
                // serialization, so that escaped angle brackets are not mistaken for tags.
                $pageTexts[$facsId] = trim(
                    html_entity_decode(strip_tags($match[2]), ENT_QUOTES | ENT_XML1, 'UTF-8')
                );
            }
        }

        return $pageTexts;
    }

    /**
     * This registers the necessary TEI namespace for the current TEI-XML
     *
     * The prefix is registered unconditionally: if the document does not use
     * the TEI namespace, XPath queries with the prefix simply match nothing
     * instead of failing because of an undefined prefix.
     *
     * @access private
     *
     * @param \SimpleXMLElement $xml The XML to register the namespace for
     *
     * @return void
     */
    private function registerTeiNamespace(\SimpleXMLElement $xml): void
    {
        $xml->registerXPathNamespace('TEI', self::TEI_NAMESPACE);
    }
}
|
sebastian-meyer marked this conversation as resolved.
Outdated
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| <?xml version="1.0" encoding="UTF-8"?> | ||
| <TEI xmlns="http://www.tei-c.org/ns/1.0"> | ||
| <teiHeader> | ||
| </teiHeader> | ||
| <facsimile> | ||
| <graphic mimeType="image/jpeg" url="https://www.example.com/00000001.tif.original.jpg" id="f0001"/> | ||
| <graphic mimeType="image/jpeg" url="https://www.example.com/00000002.tif.original.jpg" id="f0002"/> | ||
| </facsimile> | ||
| <text> | ||
| <front> | ||
| <titlePage id="uuid-82add175-7012-4a6d-bc13-a1a666acb769"> | ||
| <pb facs="#f0001" n=" - " corresp="https://www.example.com/0001"/> | ||
| <p> | ||
| <lb/>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. | ||
|
|
||
| </p> | ||
| </titlePage> | ||
| </front> | ||
| <body> | ||
| <div id="uuid-cf72f6ba-61a0-41b3-ba9b-a6331b7a504b" n="1" rend="Content"> | ||
| <pb facs="#f0002" n=" - " corresp="https://www.example.com/0002"/> | ||
| </div> | ||
| <div id="uuid-45e92103-ecd2-46ab-aabd-ddc589a548d2" n="1" rend="Aenean commodo ligula eget dolor"> | ||
| <head> | ||
| <lb/> | ||
| Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. | ||
| </head> | ||
| </div> | ||
| </body> | ||
| <back/> | ||
| </text> | ||
| </TEI> |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.