From bbb4d5cc6cb57036f396c3c556e38006fbbb4ef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrgen=20Mummert?= Date: Thu, 25 Dec 2025 21:52:22 +0100 Subject: [PATCH] Bugfix --- src/EventListener/IndexPageListener.php | 13 +- src/Service/PdfIndexService.php | 155 ++++++++++++++++++------ 2 files changed, 127 insertions(+), 41 deletions(-) diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php index ffdebd1..672879d 100644 --- a/src/EventListener/IndexPageListener.php +++ b/src/EventListener/IndexPageListener.php @@ -132,13 +132,22 @@ class IndexPageListener private function findPdfLinks(string $content): array { if (!preg_match_all( - '/]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\']/i', + '/]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is', $content, $matches )) { return []; } - return array_unique(array_map('html_entity_decode', $matches[1])); + $result = []; + + foreach ($matches[1] as $i => $href) { + $result[] = [ + 'url' => html_entity_decode($href), + 'linkText' => trim(strip_tags($matches[2][$i])) ?: null, + ]; + } + + return $result; } } \ No newline at end of file diff --git a/src/Service/PdfIndexService.php b/src/Service/PdfIndexService.php index 3f826f8..d7b7970 100644 --- a/src/Service/PdfIndexService.php +++ b/src/Service/PdfIndexService.php @@ -8,71 +8,84 @@ use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface; class PdfIndexService { - private bool $tableReset = false; private string $projectDir; + private bool $crawlStarted = false; public function __construct(ParameterBagInterface $params) { $this->projectDir = rtrim($params->get('kernel.project_dir'), '/'); } - /** - * 🔥 Wird bei JEDEM Crawl einmal aufgerufen - */ - public function resetTableOnce(): void + /* ===================================================== + * Crawl-Start (immer aufrufen!) + * ===================================================== */ + public function startCrawl(): void { - if ($this->tableReset) { + if ($this->crawlStarted) { return; } - Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf'); - error_log('tl_search_pdf wurde geleert'); + $this->crawlStarted = true; - $this->tableReset = true; + // bewusst simpel: bei JEDEM Crawl komplett leeren + Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf'); + + error_log('PDF Crawl gestartet → tl_search_pdf geleert'); } - /** - * Einstiegspunkt vom Listener - */ + /* ===================================================== + * Einstiegspunkt aus IndexPageListener + * ===================================================== */ public function handlePdfLinks(array $pdfLinks): void { - foreach ($pdfLinks as $url) { + foreach ($pdfLinks as $pdf) { try { - $path = $this->normalizePdfUrl($url); - if ($path === null) { + $url = $pdf['url']; + $linkText = $pdf['linkText'] ?? null; + + error_log('bearbeite PDF: ' . $url); + + $relativePath = $this->normalizePdfUrl($url); + if ($relativePath === null) { + error_log('→ übersprungen: kein gültiger PDF-Pfad'); continue; } - $absolutePath = $this->projectDir . '/' . ltrim($path, '/'); + $absolutePath = $this->projectDir . '/' . ltrim($relativePath, '/'); if (!is_file($absolutePath)) { + error_log('→ übersprungen: Datei existiert nicht'); continue; } - $parser = new Parser(); - $pdf = $parser->parseFile($absolutePath); - $text = $this->cleanPdfContent($pdf->getText()); + $mtime = filemtime($absolutePath) ?: 0; + $checksum = md5($relativePath . $mtime); + + // PDF parsen + [$text, $metaTitle] = $this->parsePdf($absolutePath); if ($text === '') { + error_log('→ übersprungen: kein Textinhalt'); continue; } - Database::getInstance() - ->prepare(' - INSERT INTO tl_search_pdf - (tstamp, url, title, text, checksum, file_mtime) - VALUES (?, ?, ?, ?, ?, ?) - ') - ->execute( - time(), - $path, - basename($absolutePath), - mb_substr($text, 0, 5000), - md5($path), - filemtime($absolutePath) ?: 0 - ); + // TITEL-PRIORITÄT + $title = + $linkText + ?: $metaTitle + ?: basename($absolutePath); + + $this->insertPdf( + $relativePath, + $title, + $text, + $checksum, + $mtime + ); + + error_log('→ geschrieben in tl_search_pdf'); } catch (\Throwable $e) { - error_log('PDF Fehler: ' . $e->getMessage()); + error_log('PDF Service FEHLER: ' . $e->getMessage()); } } } @@ -82,18 +95,22 @@ class PdfIndexService * ===================================================== */ private function normalizePdfUrl(string $url): ?string { - if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) { + // direkter /files-Link + if (str_starts_with($url, '/files/') && str_ends_with(strtolower($url), '.pdf')) { return $url; } - $parts = parse_url(html_entity_decode($url)); + // Contao-Download-Link (?p=) + $decoded = html_entity_decode($url); + $parts = parse_url($decoded); + if (!isset($parts['query'])) { return null; } parse_str($parts['query'], $query); - if (!empty($query['p'])) { + if (!empty($query['p']) && str_ends_with(strtolower($query['p']), '.pdf')) { return '/files/' . ltrim($query['p'], '/'); } @@ -101,16 +118,76 @@ class PdfIndexService } /* ===================================================== - * Textbereinigung + * DB + * ===================================================== */ + private function insertPdf( + string $url, + string $title, + string $text, + string $checksum, + int $mtime + ): void { + Database::getInstance() + ->prepare(' + INSERT INTO tl_search_pdf + (tstamp, url, title, text, checksum, file_mtime) + VALUES (?, ?, ?, ?, ?, ?) + ') + ->execute( + time(), + $url, + $title, + $text, + $checksum, + $mtime + ); + } + + /* ===================================================== + * PDF Parsing + * ===================================================== */ + private function parsePdf(string $absolutePath): array + { + try { + $parser = new Parser(); + $pdf = $parser->parseFile($absolutePath); + + $details = $pdf->getDetails(); + $metaTitle = $details['Title'] ?? null; + + $text = $this->cleanPdfContent($pdf->getText()); + + return [ + mb_substr($text, 0, 5000), + is_string($metaTitle) && trim($metaTitle) !== '' ? trim($metaTitle) : null, + ]; + + } catch (\Throwable $e) { + error_log('PDF Parser FEHLER: ' . $e->getMessage()); + return ['', null]; + } + } + + /* ===================================================== + * Text-Bereinigung * ===================================================== */ private function cleanPdfContent(string $text): string { + // Unicode normalisieren if (class_exists(\Normalizer::class)) { $text = \Normalizer::normalize($text, \Normalizer::FORM_C); } + // Sonderglyphen entfernen (Noten, Steuerzeichen etc.) $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text); - $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text); + + // falsche Worttrennungen ("ges pielt") + $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', '', $text); + + // Apostrophe vereinheitlichen + $text = str_replace(["\\'", "’", "‘"], "'", $text); + + // Mehrfach-Leerzeichen $text = preg_replace('/\s+/u', ' ', $text); return trim($text);