From 8f3a0ad1b2519e9ea6afa098e2435e2d9368b10e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrgen=20Mummert?= Date: Fri, 26 Dec 2025 11:24:55 +0100 Subject: [PATCH] Bugfix --- src/EventListener/IndexPageListener.php | 184 ++++++++++-------- src/Service/OfficeIndexService.php | 243 ++++++++++++++++++++++++ 2 files changed, 352 insertions(+), 75 deletions(-) create mode 100644 src/Service/OfficeIndexService.php diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php index 7590ce7..29eab2f 100644 --- a/src/EventListener/IndexPageListener.php +++ b/src/EventListener/IndexPageListener.php @@ -5,119 +5,131 @@ namespace MummertMedia\ContaoMeilisearchBundle\EventListener; use Contao\Config; use Contao\System; use MummertMedia\ContaoMeilisearchBundle\Service\PdfIndexService; +use MummertMedia\ContaoMeilisearchBundle\Service\OfficeIndexService; class IndexPageListener { private ?PdfIndexService $pdfIndexService = null; + private ?OfficeIndexService $officeIndexService = null; public function onIndexPage(string $content, array &$data, array &$set): void { // ✅ IMMER: Service einmal pro Crawl holen + Tabelle einmal leeren if ($this->pdfIndexService === null) { $this->pdfIndexService = System::getContainer()->get(PdfIndexService::class); - $this->pdfIndexService->resetTableOnce(); // <- darf NICHT von Checkbox abhängen! - } - - // ✅ Checkbox steuert nur die PDF-Suche/Indexierung (nicht den Reset!) - $pdfEnabled = (bool) (Config::get('meilisearchIndexPdfs') ?? Config::get('meilisearch_index_pdfs')); - if (!$pdfEnabled) { - return; - } - - // Marker vorhanden? - if (!str_contains($content, 'MEILISEARCH_JSON')) { - return; - } - - $parsed = $this->extractMeilisearchJson($content); - if ($parsed === null) { - return; + $this->pdfIndexService->resetTableOnce(); // darf NICHT von Checkboxen abhängen } /* * ===================== - * PRIORITY + * SEITEN-METADATEN (IMMER) * ===================== */ - $priority = - $parsed['event']['priority'] ?? null ?? - $parsed['news']['priority'] ?? null ?? - $parsed['page']['priority'] ?? null; + if (str_contains($content, 'MEILISEARCH_JSON')) { + $parsed = $this->extractMeilisearchJson($content); - if ($priority !== null && $priority !== '') { - $set['priority'] = (int) $priority; - } + if (is_array($parsed)) { - /* - * ===================== - * KEYWORDS - * ===================== - */ - $keywordSources = [ - $parsed['event']['keywords'] ?? null, - $parsed['news']['keywords'] ?? null, - $parsed['page']['keywords'] ?? null, - ]; + /* + * PRIORITY + */ + $priority = + $parsed['event']['priority'] ?? null ?? + $parsed['news']['priority'] ?? null ?? + $parsed['page']['priority'] ?? null; - $keywords = []; - foreach ($keywordSources as $src) { - if (!is_string($src) || trim($src) === '') { - continue; - } + if ($priority !== null && $priority !== '') { + $set['priority'] = (int) $priority; + } - foreach (preg_split('/\s+/', trim($src)) as $word) { - $word = trim($word); - if ($word !== '') { - $keywords[] = $word; + /* + * KEYWORDS + */ + $keywordSources = [ + $parsed['event']['keywords'] ?? null, + $parsed['news']['keywords'] ?? null, + $parsed['page']['keywords'] ?? null, + ]; + + $keywords = []; + foreach ($keywordSources as $src) { + if (!is_string($src) || trim($src) === '') { + continue; + } + + foreach (preg_split('/\s+/', trim($src)) as $word) { + $word = trim($word); + if ($word !== '') { + $keywords[] = $word; + } + } + } + + if ($keywords) { + $set['keywords'] = implode(' ', array_unique($keywords)); + } + + /* + * IMAGEPATH + */ + $image = + $parsed['event']['searchimage'] ?? null ?? + $parsed['news']['searchimage'] ?? null ?? + $parsed['page']['searchimage'] ?? null ?? + $parsed['custom']['searchimage'] ?? null; + + if (is_string($image) && $image !== '') { + $set['imagepath'] = trim($image); + } + + /* + * STARTDATE + */ + $date = + $parsed['event']['date'] ?? null ?? + $parsed['news']['date'] ?? null; + + if (is_string($date) && $date !== '') { + $ts = strtotime($date); + if ($ts !== false) { + $set['startDate'] = $ts; + } } } } - if ($keywords) { - $set['keywords'] = implode(' ', array_unique($keywords)); - } - /* * ===================== - * IMAGEPATH + * PDF-INDEXIERUNG (OPTIONAL) * ===================== */ - $image = - $parsed['event']['searchimage'] ?? null ?? - $parsed['news']['searchimage'] ?? null ?? - $parsed['page']['searchimage'] ?? null ?? - $parsed['custom']['searchimage'] ?? null; + $pdfEnabled = (bool) Config::get('meilisearch_index_pdfs'); + if ($pdfEnabled && (int) ($data['protected'] ?? 0) === 0) { - if (is_string($image) && $image !== '') { - $set['imagepath'] = trim($image); - } + $pdfLinks = $this->findPdfLinks($content); - /* - * ===================== - * STARTDATE - * ===================== - */ - $date = - $parsed['event']['date'] ?? null ?? - $parsed['news']['date'] ?? null; - - if (is_string($date) && $date !== '') { - $ts = strtotime($date); - if ($ts !== false) { - $set['startDate'] = $ts; + if ($pdfLinks !== []) { + $this->pdfIndexService->handlePdfLinks($pdfLinks); } } /* * ===================== - * PDF-ERKENNUNG + * OFFICE-INDEXIERUNG (OPTIONAL) * ===================== */ - $pdfLinks = $this->findPdfLinks($content); + $officeEnabled = (bool) Config::get('meilisearch_index_office'); + if ($officeEnabled && (int) ($data['protected'] ?? 0) === 0) { - // PDFs NUR auf öffentlichen Seiten indexieren - if ($pdfLinks !== [] && (int) ($data['protected'] ?? 0) === 0) { - $this->pdfIndexService->handlePdfLinks($pdfLinks); + if ($this->officeIndexService === null) { + $this->officeIndexService = System::getContainer()->get(OfficeIndexService::class); + } + + $officeLinks = $this->findOfficeLinks($content); + + if ($officeLinks !== []) { + $this->officeIndexService->handleOfficeLinks($officeLinks); + } } } @@ -154,4 +166,26 @@ class IndexPageListener return $result; } + + private function findOfficeLinks(string $content): array + { + if (!preg_match_all( + '/]*href=["\']([^"\']*(?:\.(?:docx|xlsx|pptx)|p=(?:docx|xlsx|pptx)(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is', + $content, + $matches + )) { + return []; + } + + $result = []; + + foreach ($matches[1] as $i => $href) { + $result[] = [ + 'url' => html_entity_decode($href), + 'linkText' => trim(strip_tags($matches[2][$i])) ?: null, + ]; + } + + return $result; + } } \ No newline at end of file diff --git a/src/Service/OfficeIndexService.php b/src/Service/OfficeIndexService.php new file mode 100644 index 0000000..8071950 --- /dev/null +++ b/src/Service/OfficeIndexService.php @@ -0,0 +1,243 @@ +projectDir = rtrim((string) $params->get('kernel.project_dir'), '/'); + } + + /** + * @param array $officeLinks + */ + public function handleOfficeLinks(array $officeLinks): void + { + foreach ($officeLinks as $row) { + $url = (string) ($row['url'] ?? ''); + $linkText = $row['linkText'] ?? null; + + if ($url === '') { + continue; + } + + try { + error_log('bearbeite Office-Datei: ' . $url); + + // innerhalb des Crawls gleiche URL nicht mehrfach parsen + $seenKey = md5($url); + if (isset($this->seenThisCrawl[$seenKey])) { + error_log('→ übersprungen: bereits im Crawl verarbeitet'); + continue; + } + $this->seenThisCrawl[$seenKey] = true; + + $normalized = $this->normalizeOfficeUrl($url); + if ($normalized === null) { + error_log('→ übersprungen: kein gültiger Office-Pfad'); + continue; + } + + [$relativePath, $type] = $normalized; + + $absolutePath = $this->getAbsolutePath($relativePath); + if (!is_file($absolutePath)) { + error_log('→ übersprungen: Datei existiert nicht: ' . $absolutePath); + continue; + } + + $mtime = (int) (filemtime($absolutePath) ?: 0); + $checksum = md5($relativePath . '|' . $mtime); + + $title = $linkText ?: basename($absolutePath); + + $text = $this->parseOfficeFile($absolutePath, $type); + if ($text === '') { + error_log('→ übersprungen: Office-Datei ohne Textinhalt'); + continue; + } + + $this->upsertOffice( + $relativePath, + $title, + $text, + $checksum, + $mtime, + $type + ); + + error_log('geschrieben in tl_search_pdf'); + + } catch (\Throwable $e) { + error_log('Office Service FEHLER: ' . $e->getMessage()); + } + } + } + + /** + * @return array{string,string}|null [relativePath, type] + */ + private function normalizeOfficeUrl(string $url): ?array + { + $decoded = html_entity_decode($url); + $parts = parse_url($decoded); + + // direkter /files/-Pfad + if (!empty($parts['path']) && str_starts_with($parts['path'], '/files/')) { + $ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION)); + if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) { + return [$parts['path'], $ext]; + } + } + + // Contao-Download-Link mit ?p= + if (!empty($parts['query'])) { + parse_str($parts['query'], $query); + + if (!empty($query['p'])) { + $p = rawurldecode((string) $query['p']); + $ext = strtolower(pathinfo($p, PATHINFO_EXTENSION)); + + if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) { + return ['/files/' . ltrim($p, '/'), $ext]; + } + } + } + + return null; + } + + private function getAbsolutePath(string $relativePath): string + { + return $this->projectDir . '/' . ltrim($relativePath, '/'); + } + + private function upsertOffice( + string $url, + string $title, + string $text, + string $checksum, + int $mtime, + string $type + ): void { + $db = Database::getInstance(); + + $db->prepare(' + INSERT INTO tl_search_pdf + (tstamp, type, url, title, text, checksum, file_mtime) + VALUES + (?, ?, ?, ?, ?, ?, ?) + ON DUPLICATE KEY UPDATE + tstamp=VALUES(tstamp), + type=VALUES(type), + url=VALUES(url), + title=VALUES(title), + text=VALUES(text), + file_mtime=VALUES(file_mtime) + ')->execute( + time(), + $type, + $url, + $title, + $text, + $checksum, + $mtime + ); + } + + private function parseOfficeFile(string $absolutePath, string $type): string + { + return match ($type) { + 'docx' => $this->parseDocx($absolutePath), + 'xlsx' => $this->parseXlsx($absolutePath), + 'pptx' => $this->parsePptx($absolutePath), + default => '', + }; + } + + private function parseDocx(string $absolutePath): string + { + try { + $phpWord = WordIOFactory::load($absolutePath); + $text = ''; + + foreach ($phpWord->getSections() as $section) { + foreach ($section->getElements() as $element) { + if (method_exists($element, 'getText')) { + $text .= ' ' . $element->getText(); + } + } + } + + return $this->cleanText($text); + + } catch (\Throwable) { + return ''; + } + } + + private function parseXlsx(string $absolutePath): string + { + try { + $spreadsheet = SpreadsheetIOFactory::load($absolutePath); + $text = ''; + + foreach ($spreadsheet->getAllSheets() as $sheet) { + foreach ($sheet->toArray() as $row) { + $text .= ' ' . implode(' ', array_filter($row, 'is_scalar')); + } + } + + return $this->cleanText($text); + + } catch (\Throwable) { + return ''; + } + } + + private function parsePptx(string $absolutePath): string + { + try { + $presentation = PresentationIOFactory::load($absolutePath); + $text = ''; + + foreach ($presentation->getAllSlides() as $slide) { + foreach ($slide->getShapeCollection() as $shape) { + if (method_exists($shape, 'getPlainText')) { + $text .= ' ' . $shape->getPlainText(); + } + } + } + + return $this->cleanText($text); + + } catch (\Throwable) { + return ''; + } + } + + private function cleanText(string $text): string + { + if (class_exists(\Normalizer::class)) { + $text = \Normalizer::normalize($text, \Normalizer::FORM_C) ?? $text; + } + + $text = str_replace(["\r\n", "\r"], "\n", $text); + $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text); + $text = preg_replace('/\s+/u', ' ', $text); + + return trim(mb_substr($text, 0, 20000)); + } +} \ No newline at end of file