diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php index 06ccc15..a91e745 100644 --- a/src/EventListener/IndexPageListener.php +++ b/src/EventListener/IndexPageListener.php @@ -30,22 +30,6 @@ class IndexPageListener 'set_keys' => array_keys($set), ]); - /* - * ===================== - * PDF: Reset genau 1× pro Crawl - * ===================== - */ - try { - $this->debug('PDF resetTableOnce(): call'); - $this->pdfIndexService->resetTableOnce(); - $this->debug('PDF resetTableOnce(): ok'); - } catch (\Throwable $e) { - $this->debug('PDF resetTableOnce(): failed', [ - 'error' => $e->getMessage(), - 'class' => $e::class, - ]); - } - /* * ===================== * SEITEN-METADATEN diff --git a/src/Resources/contao/dca/tl_search_pdf.php b/src/Resources/contao/dca/tl_search_pdf.php index ef174d5..3fb7f1b 100644 --- a/src/Resources/contao/dca/tl_search_pdf.php +++ b/src/Resources/contao/dca/tl_search_pdf.php @@ -8,10 +8,10 @@ $GLOBALS['TL_DCA']['tl_search_pdf'] = [ 'sql' => [ 'keys' => [ 'id' => 'primary', - 'checksum' => 'unique', 'page_id' => 'index', - 'url' => 'index', + 'url' => 'unique', 'type' => 'index', + 'checksum' => 'index', 'last_seen' => 'index', // ⬅️ NEU (für Cleanup-Performance) ], ], diff --git a/src/Service/OfficeIndexService.php b/src/Service/OfficeIndexService.php index a957e91..392f9ec 100644 --- a/src/Service/OfficeIndexService.php +++ b/src/Service/OfficeIndexService.php @@ -12,9 +12,6 @@ class OfficeIndexService { private string $projectDir; - // pro Crawl-Durchlauf: doppelte Verarbeitung vermeiden - private array $seenThisCrawl = []; - public function __construct(ParameterBagInterface $params) { $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/'); @@ -25,58 +22,71 @@ class OfficeIndexService */ public function handleOfficeLinks(array $officeLinks): void { + // Dedupe nur pro Aufruf (nicht "pro Crawl") + $seen = []; + $now = time(); + foreach ($officeLinks as $row) { - $url = (string) ($row['url'] ?? ''); + $url = (string) ($row['url'] ?? ''); $linkText = $row['linkText'] ?? null; if ($url === '') { continue; } - try { - // innerhalb des Crawls gleiche URL nicht mehrfach parsen - $seenKey = md5($url); - if (isset($this->seenThisCrawl[$seenKey])) { - continue; - } - $this->seenThisCrawl[$seenKey] = true; + // doppelte URLs pro Aufruf vermeiden + $seenKey = md5($url); + if (isset($seen[$seenKey])) { + continue; + } + $seen[$seenKey] = true; - $normalized = $this->normalizeOfficeUrl($url); - if ($normalized === null) { - continue; - } + $normalized = $this->normalizeOfficeUrl($url); + if ($normalized === null) { + continue; + } - [$relativePath, $type] = $normalized; + [$relativePath, $type] = $normalized; - $absolutePath = $this->getAbsolutePath($relativePath); - if (!is_file($absolutePath)) { - continue; - } + $absolutePath = $this->getAbsolutePath($relativePath); + if (!is_file($absolutePath)) { + continue; + } - $mtime = (int) (filemtime($absolutePath) ?: 0); - $checksum = md5($relativePath . '|' . $mtime); + $mtime = (int) (filemtime($absolutePath) ?: 0); + $checksum = md5($relativePath . '|' . $mtime); - $title = $linkText ?: basename($absolutePath); + // existiert bereits? + $existing = Database::getInstance() + ->prepare('SELECT checksum FROM tl_search_pdf WHERE url=? LIMIT 1') + ->execute($relativePath) + ->fetchAssoc(); + $needsParse = !$existing || ($existing['checksum'] ?? '') !== $checksum; + + // Titel-Priorität: + // 1) Linktext + // 2) Dateiname + $title = $linkText ?: basename($absolutePath); + $text = ''; + + if ($needsParse) { $text = $this->parseOfficeFile($absolutePath, $type); if ($text === '') { + // Parsing fehlgeschlagen → nichts überschreiben continue; } - - $this->upsertOffice( - $relativePath, - $title, - $text, - $checksum, - $mtime, - $type - ); - - } catch (\Throwable $e) { - error_log( - '[ContaoMeilisearch] Office indexing failed for "' . $url . '": ' . $e->getMessage() - ); } + + $this->upsertOffice( + $relativePath, + $title, + $text, // kann '' sein → SQL überschreibt dann nicht + $checksum, + $mtime, + $type, + $now + ); } } @@ -86,7 +96,11 @@ class OfficeIndexService private function normalizeOfficeUrl(string $url): ?array { $decoded = html_entity_decode($url); - $parts = parse_url($decoded); + $parts = parse_url($decoded); + + if (!$parts) { + return null; + } // 1) files/... (ohne führenden Slash) if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/')) { @@ -114,11 +128,11 @@ class OfficeIndexService if (!empty($query['file'])) { $file = urldecode((string) $query['file']); $file = ltrim($file, '/'); - $ext = strtolower(pathinfo($file, PATHINFO_EXTENSION)); + $ext = strtolower(pathinfo($file, PATHINFO_EXTENSION)); if ( - str_starts_with($file, 'files/') - && in_array($ext, ['docx', 'xlsx', 'pptx'], true) + str_starts_with($file, 'files/') && + in_array($ext, ['docx', 'xlsx', 'pptx'], true) ) { return ['/' . $file, $ext]; } @@ -126,7 +140,7 @@ class OfficeIndexService // 4) Contao 5: ?p=... if (!empty($query['p'])) { - $p = urldecode((string) $query['p']); + $p = urldecode((string) $query['p']); $ext = strtolower(pathinfo($p, PATHINFO_EXTENSION)); if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) { @@ -148,37 +162,35 @@ class OfficeIndexService string $text, string $checksum, int $mtime, - string $type + string $type, + int $now ): void { - try { - Database::getInstance() - ->prepare(' - INSERT INTO tl_search_pdf - (tstamp, type, url, title, text, checksum, file_mtime) - VALUES - (?, ?, ?, ?, ?, ?, ?) - ON DUPLICATE KEY UPDATE - tstamp=VALUES(tstamp), - type=VALUES(type), - url=VALUES(url), - title=VALUES(title), - text=VALUES(text), - file_mtime=VALUES(file_mtime) - ') - ->execute( - time(), - $type, - $url, - $title, - $text, - $checksum, - $mtime - ); - } catch (\Throwable $e) { - error_log( - '[ContaoMeilisearch] Failed to write Office index entry (' . $url . '): ' . $e->getMessage() + Database::getInstance() + ->prepare(' + INSERT INTO tl_search_pdf + (tstamp, last_seen, type, url, title, text, checksum, file_mtime) + VALUES + (?, ?, ?, ?, ?, ?, ?, ?) + ON DUPLICATE KEY UPDATE + tstamp = VALUES(tstamp), + last_seen = VALUES(last_seen), + type = VALUES(type), + url = VALUES(url), + title = VALUES(title), + checksum = VALUES(checksum), + file_mtime = VALUES(file_mtime), + text = IF(VALUES(text) = "" OR VALUES(text) IS NULL, text, VALUES(text)) + ') + ->execute( + $now, + $now, + $type, + $url, + $title, + $text, + $checksum, + $mtime ); - } } private function parseOfficeFile(string $absolutePath, string $type): string @@ -206,10 +218,7 @@ class OfficeIndexService } return $this->cleanText($text); - } catch (\Throwable $e) { - error_log( - '[ContaoMeilisearch] Failed to parse DOCX "' . $absolutePath . '": ' . $e->getMessage() - ); + } catch (\Throwable) { return ''; } } @@ -227,10 +236,7 @@ class OfficeIndexService } return $this->cleanText($text); - } catch (\Throwable $e) { - error_log( - '[ContaoMeilisearch] Failed to parse XLSX "' . $absolutePath . '": ' . $e->getMessage() - ); + } catch (\Throwable) { return ''; } } @@ -250,10 +256,7 @@ class OfficeIndexService } return $this->cleanText($text); - } catch (\Throwable $e) { - error_log( - '[ContaoMeilisearch] Failed to parse PPTX "' . $absolutePath . '": ' . $e->getMessage() - ); + } catch (\Throwable) { return ''; } } diff --git a/src/Service/PdfIndexService.php b/src/Service/PdfIndexService.php index 26522c6..05fd396 100644 --- a/src/Service/PdfIndexService.php +++ b/src/Service/PdfIndexService.php @@ -10,48 +10,34 @@ class PdfIndexService { private string $projectDir; - private bool $didReset = false; - private array $seenThisCrawl = []; - public function __construct(ParameterBagInterface $params) { $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/'); } - /** - * Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen. - */ - public function resetTableOnce(): void - { - if ($this->didReset) { - return; - } - - $this->didReset = true; - $this->seenThisCrawl = []; - - Database::getInstance()->execute('TRUNCATE tl_search_pdf'); - } - /** * @param array $pdfLinks */ public function handlePdfLinks(array $pdfLinks): void { + // Dedupe nur pro Aufruf (nicht "pro Crawl") + $seen = []; + $now = time(); + foreach ($pdfLinks as $row) { - $url = (string) ($row['url'] ?? ''); + $url = (string) ($row['url'] ?? ''); $linkText = $row['linkText'] ?? null; if ($url === '') { continue; } - // innerhalb eines Crawls doppelte URLs vermeiden + // doppelte URLs pro Aufruf vermeiden $seenKey = md5($url); - if (isset($this->seenThisCrawl[$seenKey])) { + if (isset($seen[$seenKey])) { continue; } - $this->seenThisCrawl[$seenKey] = true; + $seen[$seenKey] = true; $normalizedPath = $this->normalizePdfUrl($url); if ($normalizedPath === null) { @@ -63,27 +49,42 @@ class PdfIndexService continue; } - $mtime = (int) (filemtime($absolutePath) ?: 0); + $mtime = (int) (filemtime($absolutePath) ?: 0); $checksum = md5($normalizedPath . '|' . $mtime); + // existiert bereits? + $existing = Database::getInstance() + ->prepare('SELECT checksum FROM tl_search_pdf WHERE url=? LIMIT 1') + ->execute($normalizedPath) + ->fetchAssoc(); + + $needsParse = !$existing || ($existing['checksum'] ?? '') !== $checksum; + // Titel-Priorität: // 1) Linktext // 2) PDF-Metadaten // 3) Dateiname - $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath); - $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath)); + $title = $linkText ?: basename($absolutePath); + $text = ''; - $text = $this->parsePdf($absolutePath); - if ($text === '') { - continue; + if ($needsParse) { + $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath); + $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath)); + + $text = $this->parsePdf($absolutePath); + if ($text === '') { + // wenn parsing fehlschlägt, NICHT überschreiben + continue; + } } $this->upsertPdf( $normalizedPath, $title, - $text, + $text, // kann '' sein → wird in SQL nicht überschrieben $checksum, - $mtime + $mtime, + $now ); } } @@ -91,7 +92,11 @@ class PdfIndexService private function normalizePdfUrl(string $url): ?string { $decoded = html_entity_decode($url); - $parts = parse_url($decoded); + $parts = parse_url($decoded); + + if (!$parts) { + return null; + } // 1) files/...pdf (ohne führenden Slash) if ( @@ -149,23 +154,29 @@ class PdfIndexService string $title, string $text, string $checksum, - int $mtime + int $mtime, + int $now ): void { Database::getInstance() ->prepare(' INSERT INTO tl_search_pdf - (tstamp, url, title, text, checksum, file_mtime) + (tstamp, last_seen, type, url, title, text, checksum, file_mtime) VALUES - (?, ?, ?, ?, ?, ?) + (?, ?, ?, ?, ?, ?, ?, ?) ON DUPLICATE KEY UPDATE - tstamp=VALUES(tstamp), - url=VALUES(url), - title=VALUES(title), - text=VALUES(text), - file_mtime=VALUES(file_mtime) + tstamp = VALUES(tstamp), + last_seen = VALUES(last_seen), + type = VALUES(type), + url = VALUES(url), + title = VALUES(title), + checksum = VALUES(checksum), + file_mtime = VALUES(file_mtime), + text = IF(VALUES(text) = "" OR VALUES(text) IS NULL, text, VALUES(text)) ') ->execute( - time(), + $now, + $now, + 'pdf', $url, $title, $text, @@ -178,8 +189,8 @@ class PdfIndexService { try { $parser = new Parser(); - $pdf = $parser->parseFile($absolutePath); - $text = $this->cleanPdfContent($pdf->getText()); + $pdf = $parser->parseFile($absolutePath); + $text = $this->cleanPdfContent($pdf->getText()); return mb_substr($text, 0, 20000); } catch (\Throwable) { @@ -190,8 +201,8 @@ class PdfIndexService private function readPdfMetaTitle(string $absolutePath): ?string { try { - $parser = new Parser(); - $pdf = $parser->parseFile($absolutePath); + $parser = new Parser(); + $pdf = $parser->parseFile($absolutePath); $details = $pdf->getDetails(); foreach (['Title', 'title'] as $key) { @@ -203,6 +214,7 @@ class PdfIndexService } } } catch (\Throwable) { + // ignore } return null;