From 04f3e76c8f0d1b94ba278cd9b2b483584288f46b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrgen=20Mummert?= Date: Sun, 28 Dec 2025 11:56:23 +0100 Subject: [PATCH] Bugfix --- src/Service/PdfIndexService.php | 152 +++++++++++++------------------- 1 file changed, 61 insertions(+), 91 deletions(-) diff --git a/src/Service/PdfIndexService.php b/src/Service/PdfIndexService.php index 254785d..26522c6 100644 --- a/src/Service/PdfIndexService.php +++ b/src/Service/PdfIndexService.php @@ -16,98 +16,68 @@ class PdfIndexService public function __construct(ParameterBagInterface $params) { $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/'); - $this->debug("projectDir={$this->projectDir}"); - } - - private function debug(string $message): void - { - $stream = \defined('STDERR') - ? STDERR - : fopen('php://stderr', 'wb'); - - fwrite($stream, "[Meili PDF DEBUG] {$message}\n"); } + /** + * Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen. + */ public function resetTableOnce(): void { if ($this->didReset) { - $this->debug('resetTableOnce(): already reset'); return; } - $this->debug('resetTableOnce(): TRUNCATE tl_search_pdf'); - $this->didReset = true; $this->seenThisCrawl = []; - try { - Database::getInstance()->execute('TRUNCATE tl_search_pdf'); - } catch (\Throwable $e) { - $this->debug('TRUNCATE failed: ' . $e->getMessage()); - } + Database::getInstance()->execute('TRUNCATE tl_search_pdf'); } + /** + * @param array $pdfLinks + */ public function handlePdfLinks(array $pdfLinks): void { - $this->debug('handlePdfLinks(): count=' . count($pdfLinks)); - foreach ($pdfLinks as $row) { $url = (string) ($row['url'] ?? ''); $linkText = $row['linkText'] ?? null; - $this->debug("URL={$url}"); - if ($url === '') { - $this->debug('→ empty URL, skip'); continue; } + // innerhalb eines Crawls doppelte URLs vermeiden $seenKey = md5($url); if (isset($this->seenThisCrawl[$seenKey])) { - $this->debug('→ already processed, skip'); continue; } $this->seenThisCrawl[$seenKey] = true; $normalizedPath = $this->normalizePdfUrl($url); - $this->debug('normalizePdfUrl() → ' . ($normalizedPath ?? 'NULL')); - if ($normalizedPath === null) { - $this->debug('→ normalization failed, skip'); continue; } $absolutePath = $this->getAbsolutePath($normalizedPath); - $this->debug("absolutePath={$absolutePath}"); - if (!is_file($absolutePath)) { - $this->debug('→ file does NOT exist'); continue; } - $this->debug('→ file exists'); - $mtime = (int) (filemtime($absolutePath) ?: 0); $checksum = md5($normalizedPath . '|' . $mtime); - $this->debug("mtime={$mtime} checksum={$checksum}"); - + // Titel-Priorität: + // 1) Linktext + // 2) PDF-Metadaten + // 3) Dateiname $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath); - $this->debug('metaTitle=' . ($pdfMetaTitle ?: 'NULL')); - $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath)); - $this->debug("final title={$title}"); $text = $this->parsePdf($absolutePath); - $this->debug('parsed text length=' . strlen($text)); - if ($text === '') { - $this->debug('→ empty text, skip'); continue; } - $this->debug('→ writing to DB'); - $this->upsertPdf( $normalizedPath, $title, @@ -120,48 +90,52 @@ class PdfIndexService private function normalizePdfUrl(string $url): ?string { - $this->debug("normalizePdfUrl(): {$url}"); - $decoded = html_entity_decode($url); $parts = parse_url($decoded); - if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/') && str_ends_with(strtolower($parts['path']), '.pdf')) { - $r = '/' . $parts['path']; - $this->debug("→ relative files path {$r}"); - return $r; + // 1) files/...pdf (ohne führenden Slash) + if ( + !empty($parts['path']) + && str_starts_with($parts['path'], 'files/') + && str_ends_with(strtolower($parts['path']), '.pdf') + ) { + return '/' . $parts['path']; } - if (!empty($parts['path']) && str_starts_with($parts['path'], '/files/') && str_ends_with(strtolower($parts['path']), '.pdf')) { - $this->debug("→ absolute files path {$parts['path']}"); + // 2) /files/...pdf + if ( + !empty($parts['path']) + && str_starts_with($parts['path'], '/files/') + && str_ends_with(strtolower($parts['path']), '.pdf') + ) { return $parts['path']; } if (empty($parts['query'])) { - $this->debug('→ no query'); return null; } parse_str($parts['query'], $query); + // 3) Contao 4: ?file=files/... if (!empty($query['file'])) { $file = urldecode((string) $query['file']); $file = ltrim($file, '/'); - if (str_starts_with($file, 'files/') && str_ends_with(strtolower($file), '.pdf')) { - $r = '/' . $file; - $this->debug("→ file= normalized {$r}"); - return $r; + if ( + str_starts_with($file, 'files/') + && str_ends_with(strtolower($file), '.pdf') + ) { + return '/' . $file; } } + // 4) Contao 5: ?p=... if (!empty($query['p'])) { $p = urldecode((string) $query['p']); - $r = '/files/' . ltrim($p, '/'); - $this->debug("→ p= normalized {$r}"); - return $r; + return '/files/' . ltrim($p, '/'); } - $this->debug('→ no usable parameter'); return null; } @@ -177,33 +151,27 @@ class PdfIndexService string $checksum, int $mtime ): void { - try { - Database::getInstance() - ->prepare(' - INSERT INTO tl_search_pdf - (tstamp, url, title, text, checksum, file_mtime) - VALUES - (?, ?, ?, ?, ?, ?) - ON DUPLICATE KEY UPDATE - tstamp=VALUES(tstamp), - url=VALUES(url), - title=VALUES(title), - text=VALUES(text), - file_mtime=VALUES(file_mtime) - ') - ->execute( - time(), - $url, - $title, - $text, - $checksum, - $mtime - ); - - $this->debug('→ DB write OK'); - } catch (\Throwable $e) { - $this->debug('DB write failed: ' . $e->getMessage()); - } + Database::getInstance() + ->prepare(' + INSERT INTO tl_search_pdf + (tstamp, url, title, text, checksum, file_mtime) + VALUES + (?, ?, ?, ?, ?, ?) + ON DUPLICATE KEY UPDATE + tstamp=VALUES(tstamp), + url=VALUES(url), + title=VALUES(title), + text=VALUES(text), + file_mtime=VALUES(file_mtime) + ') + ->execute( + time(), + $url, + $title, + $text, + $checksum, + $mtime + ); } private function parsePdf(string $absolutePath): string @@ -212,9 +180,9 @@ class PdfIndexService $parser = new Parser(); $pdf = $parser->parseFile($absolutePath); $text = $this->cleanPdfContent($pdf->getText()); + return mb_substr($text, 0, 20000); - } catch (\Throwable $e) { - $this->debug('parsePdf failed: ' . $e->getMessage()); + } catch (\Throwable) { return ''; } } @@ -228,11 +196,13 @@ class PdfIndexService foreach (['Title', 'title'] as $key) { if (!empty($details[$key]) && is_string($details[$key])) { - return trim($details[$key]); + $t = trim($details[$key]); + if ($t !== '') { + return $t; + } } } - } catch (\Throwable $e) { - $this->debug('readPdfMetaTitle failed: ' . $e->getMessage()); + } catch (\Throwable) { } return null;