From 4a09f8530acce1e62fd7e033099940f0438fb2c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrgen=20Mummert?= Date: Tue, 23 Dec 2025 13:36:04 +0100 Subject: [PATCH] Bugfix --- src/EventListener/IndexPageListener.php | 106 +++++++++++++++++++++++- 1 file changed, 102 insertions(+), 4 deletions(-) diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php index 94b9b45..27dc811 100644 --- a/src/EventListener/IndexPageListener.php +++ b/src/EventListener/IndexPageListener.php @@ -2,6 +2,8 @@ namespace MummertMedia\ContaoMeilisearchBundle\EventListener; +use Smalot\PdfParser\Parser; + class IndexPageListener { public function onIndexPage(string $content, array &$data, array &$set): void @@ -66,9 +68,9 @@ class IndexPageListener * ===================== */ $image = - $parsed['event']['searchimage'] ?? null ?? - $parsed['news']['searchimage'] ?? null ?? - $parsed['page']['searchimage'] ?? null ?? + $parsed['event']['searchimage'] ?? null ?? + $parsed['news']['searchimage'] ?? null ?? + $parsed['page']['searchimage'] ?? null ?? $parsed['custom']['searchimage'] ?? null; if (is_string($image) && $image !== '') { @@ -77,7 +79,7 @@ class IndexPageListener /* * ===================== - * STARTDATE (event.date/news.date => timestamp) + * STARTDATE (event/news) * ===================== */ $date = @@ -90,8 +92,24 @@ class IndexPageListener $set['startDate'] = $ts; } } + + /* + * ===================== + * PDF LINKS INDEXIEREN + * ===================== + */ + $pdfText = $this->extractPdfTextFromContent($content); + + if ($pdfText !== '') { + $set['text'] = trim( + ($set['text'] ?? '') . "\n\n" . $pdfText + ); + } } + /** + * Extrahiert das JSON aus + */ private function extractMeilisearchJson(string $content): ?array { if (!preg_match('//s', $content, $m)) { @@ -103,4 +121,84 @@ class IndexPageListener return is_array($data) ? $data : null; } + + /** + * Findet PDF-Links im HTML und extrahiert deren Text + */ + private function extractPdfTextFromContent(string $content): string + { + if (!preg_match_all( + '/]*href=["\']([^"\']+\.pdf[^"\']*)["\'][^>]*>/i', + $content, + $matches + )) { + return ''; + } + + $texts = []; + + foreach ($matches[1] as $url) { + $pdfText = $this->parsePdfFromUrl($url); + + if ($pdfText !== '') { + $texts[] = $pdfText; + } + } + + return implode("\n\n", $texts); + } + + /** + * Lädt und parsed ein PDF (nur /files/) + */ + private function parsePdfFromUrl(string $url): string + { + // Nur interne PDFs + if (!str_contains($url, '/files/')) { + return ''; + } + + // relative URLs normalisieren + if (str_starts_with($url, '/')) { + $url = + ($_SERVER['REQUEST_SCHEME'] ?? 'https') + . '://' . ($_SERVER['HTTP_HOST'] ?? '') + . $url; + } + + try { + $pdfContent = @file_get_contents($url); + if (!$pdfContent) { + return ''; + } + + $parser = new Parser(); + $pdf = $parser->parseContent($pdfContent); + + $text = $this->cleanPdfContent($pdf->getText()); + + // Begrenzung für Meilisearch + return mb_substr($text, 0, 2000); + + } catch (\Throwable) { + return ''; + } + } + + /** + * Bereinigt PDF-Text + */ + private function cleanPdfContent(string $content): string + { + // UTF-8 normalisieren + $content = mb_convert_encoding($content, 'UTF-8', 'UTF-8'); + + // Steuerzeichen entfernen + $content = preg_replace('/[\x00-\x1F\x7F]/u', ' ', $content); + + // Whitespaces normalisieren + $content = preg_replace('/\s+/u', ' ', $content); + + return trim($content); + } } \ No newline at end of file