From 85cbf2a235b7d412efad8da34ff46b35f90348e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrgen=20Mummert?= Date: Thu, 25 Dec 2025 15:53:35 +0100 Subject: [PATCH] Bugfix --- src/EventListener/IndexPageListener.php | 28 ++--- src/Service/PdfIndexService.php | 132 ++++++++---------------- 2 files changed, 52 insertions(+), 108 deletions(-) diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php index a17f7af..6568177 100644 --- a/src/EventListener/IndexPageListener.php +++ b/src/EventListener/IndexPageListener.php @@ -11,6 +11,12 @@ class IndexPageListener public function onIndexPage(string $content, array &$data, array &$set): void { + // 🔑 Service einmal pro Crawl initialisieren + Reset ausführen + if ($this->pdfIndexService === null) { + $this->pdfIndexService = System::getContainer()->get(PdfIndexService::class); + $this->pdfIndexService->resetTableOnce(); + } + // Marker vorhanden? if (!str_contains($content, 'MEILISEARCH_JSON')) { return; @@ -53,9 +59,7 @@ class IndexPageListener } foreach (preg_split('/\s+/', trim($src)) as $word) { - if ($word !== '') { - $keywords[] = $word; - } + $keywords[] = $word; } } @@ -101,17 +105,9 @@ class IndexPageListener */ $pdfLinks = $this->findPdfLinks($content); - if ($pdfLinks === []) { - return; + if ($pdfLinks !== []) { + $this->pdfIndexService->handlePdfLinks($pdfLinks); } - - // Service lazy aus Container holen - if ($this->pdfIndexService === null) { - $this->pdfIndexService = System::getContainer()->get(PdfIndexService::class); - $this->pdfIndexService->resetTableOnce(); - } - - $this->pdfIndexService->handlePdfLinks($pdfLinks); } /* ===================================================== @@ -130,7 +126,7 @@ class IndexPageListener } /* ===================================================== - * PDF-Links im Markup finden + * PDF-Links im HTML finden * ===================================================== */ private function findPdfLinks(string $content): array { @@ -142,8 +138,6 @@ class IndexPageListener return []; } - return array_unique( - array_map('html_entity_decode', $matches[1]) - ); + return array_unique(array_map('html_entity_decode', $matches[1])); } } \ No newline at end of file diff --git a/src/Service/PdfIndexService.php b/src/Service/PdfIndexService.php index 395f0b7..3f826f8 100644 --- a/src/Service/PdfIndexService.php +++ b/src/Service/PdfIndexService.php @@ -8,17 +8,17 @@ use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface; class PdfIndexService { - private string $projectDir; private bool $tableReset = false; + private string $projectDir; public function __construct(ParameterBagInterface $params) { $this->projectDir = rtrim($params->get('kernel.project_dir'), '/'); } - /* ===================================================== - * Reset tl_search_pdf einmal pro Crawl - * ===================================================== */ + /** + * 🔥 Wird bei JEDEM Crawl einmal aufgerufen + */ public function resetTableOnce(): void { if ($this->tableReset) { @@ -26,50 +26,53 @@ class PdfIndexService } Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf'); - $this->tableReset = true; + error_log('tl_search_pdf wurde geleert'); - error_log('PDF Reset: tl_search_pdf geleert'); + $this->tableReset = true; } - /* ===================================================== - * Einstiegspunkt aus Listener - * ===================================================== */ + /** + * Einstiegspunkt vom Listener + */ public function handlePdfLinks(array $pdfLinks): void { foreach ($pdfLinks as $url) { try { - $normalizedPath = $this->normalizePdfUrl($url); - if ($normalizedPath === null) { + $path = $this->normalizePdfUrl($url); + if ($path === null) { continue; } - $absolutePath = $this->getAbsolutePath($normalizedPath); + $absolutePath = $this->projectDir . '/' . ltrim($path, '/'); if (!is_file($absolutePath)) { continue; } - $mtime = filemtime($absolutePath) ?: 0; - $checksum = md5($normalizedPath . $mtime); + $parser = new Parser(); + $pdf = $parser->parseFile($absolutePath); + $text = $this->cleanPdfContent($pdf->getText()); - if ($this->alreadyIndexed($checksum)) { - continue; - } - - $text = $this->parsePdf($absolutePath); if ($text === '') { continue; } - $this->insertPdf( - $normalizedPath, - basename($absolutePath), - $text, - $checksum, - $mtime - ); + Database::getInstance() + ->prepare(' + INSERT INTO tl_search_pdf + (tstamp, url, title, text, checksum, file_mtime) + VALUES (?, ?, ?, ?, ?, ?) + ') + ->execute( + time(), + $path, + basename($absolutePath), + mb_substr($text, 0, 5000), + md5($path), + filemtime($absolutePath) ?: 0 + ); } catch (\Throwable $e) { - error_log('PDF Service Fehler: ' . $e->getMessage()); + error_log('PDF Fehler: ' . $e->getMessage()); } } } @@ -83,9 +86,7 @@ class PdfIndexService return $url; } - $decoded = html_entity_decode($url); - $parts = parse_url($decoded); - + $parts = parse_url(html_entity_decode($url)); if (!isset($parts['query'])) { return null; } @@ -100,69 +101,18 @@ class PdfIndexService } /* ===================================================== - * Pfade + * Textbereinigung * ===================================================== */ - private function getAbsolutePath(string $relativePath): string + private function cleanPdfContent(string $text): string { - return $this->projectDir . '/' . ltrim($relativePath, '/'); - } - - /* ===================================================== - * DB - * ===================================================== */ - private function alreadyIndexed(string $checksum): bool - { - $result = Database::getInstance() - ->prepare('SELECT id FROM tl_search_pdf WHERE checksum = ?') - ->execute($checksum); - - return $result->numRows > 0; - } - - private function insertPdf( - string $path, - string $title, - string $text, - string $checksum, - int $mtime - ): void { - Database::getInstance() - ->prepare(' - INSERT INTO tl_search_pdf - (tstamp, url, title, text, checksum, file_mtime) - VALUES (?, ?, ?, ?, ?, ?) - ') - ->execute( - time(), - $path, - $title, - $text, - $checksum, - $mtime - ); - } - - /* ===================================================== - * PDF-Parsing - * ===================================================== */ - private function parsePdf(string $absolutePath): string - { - try { - $parser = new Parser(); - $pdf = $parser->parseFile($absolutePath); - - $text = $pdf->getText(); - - if (class_exists(\Normalizer::class)) { - $text = \Normalizer::normalize($text, \Normalizer::FORM_C); - } - - $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}]/u', ' ', $text); - $text = preg_replace('/\s+/u', ' ', $text); - - return trim(mb_substr($text, 0, 5000)); - } catch (\Throwable) { - return ''; + if (class_exists(\Normalizer::class)) { + $text = \Normalizer::normalize($text, \Normalizer::FORM_C); } + + $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text); + $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text); + $text = preg_replace('/\s+/u', ' ', $text); + + return trim($text); } } \ No newline at end of file