From 0db8942229af1290bec881269dab973dadf0462a Mon Sep 17 00:00:00 2001
From: Stefan Weil
Date: Sun, 12 Apr 2026 15:02:13 +0200
Subject: [PATCH] Optimize processing of sitemap with Tika

- Don't use temporary file for Tika.
- Replace two calls of Tika app by a single call.

Signed-off-by: Stefan Weil
---
 .../src/VuFind/XSLT/Import/VuFindSitemap.php | 62 +++++++++++++++----
 1 file changed, 50 insertions(+), 12 deletions(-)

diff --git a/module/VuFind/src/VuFind/XSLT/Import/VuFindSitemap.php b/module/VuFind/src/VuFind/XSLT/Import/VuFindSitemap.php
index 8b2706ae19b3..ec4b66b9ec6b 100644
--- a/module/VuFind/src/VuFind/XSLT/Import/VuFindSitemap.php
+++ b/module/VuFind/src/VuFind/XSLT/Import/VuFindSitemap.php
@@ -99,6 +99,44 @@ protected static function getApertureFields($htmlFile)
         ];
     }
 
+    /**
+     * Load metadata and full text of an HTML document with one Tika call.
+     *
+     * @param string $url URL or local file containing HTML.
+     *
+     * @return array Fields: title, keywords (array), description, fulltext.
+     */
+    protected static function getTikaData($url)
+    {
+        // Run Tika once and decode its JSON output (metadata plus plain text):
+        $json = json_decode(static::harvestWithTika($url, '--jsonRecursive --text'), true);
+        // Use an empty record if the output was not the expected JSON list:
+        $doc = is_array($json[0] ?? null) ? $json[0] : [];
+
+        // Like keywords below, a value may be an array; join it to one string:
+        $title = implode(' ', (array)($doc['dc:title'] ?? $doc['title'] ?? ''));
+        $description = implode(
+            ' ',
+            (array)($doc['dc:description'] ?? $doc['description'] ?? '')
+        );
+        $keywords = [];
+        if (!empty($doc['keywords'])) {
+            // keywords may come back as a string or an array
+            foreach ((array)$doc['keywords'] as $current) {
+                $keywords[] = html_entity_decode($current, ENT_QUOTES, 'UTF-8');
+            }
+        }
+
+        // Send back the extracted fields:
+        return [
+            'title' => $title,
+            'keywords' => $keywords,
+            'description' => $description,
+            // Full text starts with the title, added exactly once:
+            'fulltext' => trim($title . ' ' . ($doc['X-TIKA:content'] ?? '')),
+        ];
+    }
+
     /**
      * Load metadata about an HTML document using Tika.
* @@ -247,29 +285,29 @@ protected static function getDocumentFieldArray($url) return []; } - // Grab the HTML and write it to disk: - $htmlFile = tempnam('/tmp', 'htm'); - $html = file_get_contents($url); - file_put_contents($htmlFile, $html); - // Use the appropriate full text parser: switch ($parser) { case 'Aperture': + // Grab the HTML and write it to disk: + $htmlFile = tempnam('/tmp', 'htm'); + $html = file_get_contents($url); + file_put_contents($htmlFile, $html); + $fields = static::getApertureFields($htmlFile); + + // Add data loaded directly from HTML: + $fields += static::getHtmlFields($html); + + // Clean up HTML file: + @unlink($htmlFile); break; case 'Tika': - $fields = static::getTikaFields($htmlFile); + $fields = static::getTikaData($url); break; default: throw new \Exception('Unexpected parser: ' . $parser); } - // Clean up HTML file: - @unlink($htmlFile); - - // Add data loaded directly from HTML: - $fields += static::getHtmlFields($html); - // Clean up/normalize full text: $fields['fulltext'] = trim( preg_replace(