From 9edb1e4713935d01d693b19bc37e6f58691be690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrgen=20Mummert?= Date: Sun, 28 Dec 2025 11:29:13 +0100 Subject: [PATCH] Bugfix --- src/EventListener/IndexPageListener.php | 178 ++++++++++++------- src/Service/PdfIndexService.php | 223 ++++++++++++++---------- 2 files changed, 241 insertions(+), 160 deletions(-) diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php index 4fb0dc5..86733c9 100644 --- a/src/EventListener/IndexPageListener.php +++ b/src/EventListener/IndexPageListener.php @@ -15,18 +15,92 @@ class IndexPageListener public function onIndexPage(string $content, array &$data, array &$set): void { - fwrite(STDERR, "\n[Meili DEBUG] onIndexPage() called\n"); - /* * ===================== * PDF: Reset genau 1× pro Crawl * ===================== */ try { - fwrite(STDERR, "[Meili DEBUG] resetTableOnce()\n"); $this->pdfIndexService->resetTableOnce(); } catch (\Throwable $e) { - fwrite(STDERR, "[Meili DEBUG] PDF reset failed: {$e->getMessage()}\n"); + error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage()); + } + + /* + * ===================== + * SEITEN-METADATEN + * ===================== + */ + if (str_contains($content, 'MEILISEARCH_JSON')) { + try { + $parsed = $this->extractMeilisearchJson($content); + } catch (\Throwable $e) { + error_log('[ContaoMeilisearch] Failed to extract MEILISEARCH_JSON: ' . $e->getMessage()); + $parsed = null; + } + + if (is_array($parsed)) { + + // PRIORITY + $priority = + $parsed['event']['priority'] + ?? $parsed['news']['priority'] + ?? $parsed['page']['priority'] + ?? null; + + if ($priority !== null && $priority !== '') { + $set['priority'] = (int) $priority; + } + + // KEYWORDS + $keywordSources = [ + $parsed['event']['keywords'] ?? null, + $parsed['news']['keywords'] ?? null, + $parsed['page']['keywords'] ?? null, + ]; + + $keywords = []; + foreach ($keywordSources as $src) { + if (!is_string($src) || trim($src) === '') { + continue; + } + foreach (preg_split('/\s+/', trim($src)) as $word) { + $keywords[] = $word; + } + } + + if ($keywords) { + $set['keywords'] = implode(' ', array_unique($keywords)); + } + + // IMAGEPATH + if (!empty($parsed['page']['searchimage'])) { + $set['imagepath'] = trim((string) $parsed['page']['searchimage']); + } + + // STARTDATE + $startDate = + $parsed['event']['startDate'] + ?? $parsed['news']['startDate'] + ?? null; + + if (is_numeric($startDate) && (int) $startDate > 0) { + $set['startDate'] = (int) $startDate; + } + + // CHECKSUM + try { + $checksumSeed = (string) ($data['checksum'] ?? ''); + $checksumSeed .= '|' . ($set['keywords'] ?? ''); + $checksumSeed .= '|' . ($set['priority'] ?? ''); + $checksumSeed .= '|' . ($set['imagepath'] ?? ''); + $checksumSeed .= '|' . ($set['startDate'] ?? ''); + + $set['checksum'] = md5($checksumSeed); + } catch (\Throwable $e) { + error_log('[ContaoMeilisearch] Failed to generate checksum: ' . $e->getMessage()); + } + } } /* @@ -35,91 +109,67 @@ class IndexPageListener * ===================== */ if ((int) ($data['protected'] ?? 0) !== 0) { - fwrite(STDERR, "[Meili DEBUG] Page is protected → skip files\n"); return; } $indexPdfs = (bool) Config::get('meilisearch_index_pdfs'); $indexOffice = (bool) Config::get('meilisearch_index_office'); - fwrite( - STDERR, - "[Meili DEBUG] Settings: pdfs=" - . ($indexPdfs ? '1' : '0') - . " office=" - . ($indexOffice ? '1' : '0') - . "\n" - ); - if (!$indexPdfs && !$indexOffice) { - fwrite(STDERR, "[Meili DEBUG] No file indexing enabled → return\n"); return; } $links = $this->findAllLinks($content); - fwrite(STDERR, "[Meili DEBUG] Found " . count($links) . " links\n"); $pdfLinks = []; $officeLinks = []; foreach ($links as $link) { - fwrite(STDERR, "[Meili DEBUG] URL: {$link['url']}\n"); - $type = $this->detectIndexableFileType($link['url']); - fwrite( - STDERR, - "[Meili DEBUG] → detected type: " - . ($type ?? 'none') - . "\n" - ); - if ($type === 'pdf') { - if ($indexPdfs) { - fwrite(STDERR, "[Meili DEBUG] → add to PDF queue\n"); - $pdfLinks[] = $link; - } else { - fwrite(STDERR, "[Meili DEBUG] → PDF indexing disabled\n"); - } + if ($type === 'pdf' && $indexPdfs) { + $pdfLinks[] = $link; continue; } - if (in_array($type, ['docx', 'xlsx', 'pptx'], true)) { - if ($indexOffice) { - fwrite(STDERR, "[Meili DEBUG] → add to OFFICE queue\n"); - $officeLinks[] = $link; - } else { - fwrite(STDERR, "[Meili DEBUG] → Office indexing disabled\n"); - } - continue; + if ( + in_array($type, ['docx', 'xlsx', 'pptx'], true) + && $indexOffice + ) { + $officeLinks[] = $link; } - - fwrite(STDERR, "[Meili DEBUG] → ignored\n"); } - fwrite( - STDERR, - "[Meili DEBUG] Final queues: pdf=" - . count($pdfLinks) - . " office=" - . count($officeLinks) - . "\n" - ); - try { if ($pdfLinks !== []) { - fwrite(STDERR, "[Meili DEBUG] Calling handlePdfLinks()\n"); $this->pdfIndexService->handlePdfLinks($pdfLinks); } if ($officeLinks !== []) { - fwrite(STDERR, "[Meili DEBUG] Calling handleOfficeLinks()\n"); $this->officeIndexService->handleOfficeLinks($officeLinks); } } catch (\Throwable $e) { - fwrite(STDERR, "[Meili DEBUG] File indexing failed: {$e->getMessage()}\n"); + error_log('[ContaoMeilisearch] File indexing failed: ' . $e->getMessage()); } } + /** + * Extrahiert MEILISEARCH_JSON aus HTML-Kommentar + */ + private function extractMeilisearchJson(string $content): ?array + { + if (!preg_match('//s', $content, $m)) { + return null; + } + + $json = preg_replace('/^\xEF\xBB\xBF/', '', trim($m[1])); + $data = json_decode($json, true); + + return json_last_error() === JSON_ERROR_NONE && is_array($data) + ? $data + : null; + } + /** * Sammle alle Links */ @@ -150,39 +200,35 @@ class IndexPageListener */ private function detectIndexableFileType(string $url): ?string { - fwrite(STDERR, "[Meili DEBUG] detectIndexableFileType(): $url\n"); - + // Hash entfernen $url = strtok($url, '#'); - $parts = parse_url($url); + $parts = parse_url($url); if (!$parts) { - fwrite(STDERR, "[Meili DEBUG] → parse_url failed\n"); return null; } + // direkter Pfad (/files/…) if (!empty($parts['path'])) { $ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION)); - fwrite(STDERR, "[Meili DEBUG] → path ext: $ext\n"); - if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) { return $ext; } } + // Query-Parameter (Contao 4 + 5) if (!empty($parts['query'])) { parse_str($parts['query'], $query); foreach (['file', 'p', 'f'] as $param) { if (!empty($query[$param])) { - $candidate = rawurldecode( - html_entity_decode((string) $query[$param], ENT_QUOTES) - ); + $candidate = (string) $query[$param]; + + // sicher decodieren (Contao 4 + 5) + $candidate = html_entity_decode($candidate, ENT_QUOTES); + $candidate = rawurldecode($candidate); $ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION)); - fwrite( - STDERR, - "[Meili DEBUG] → query $param=$candidate ext=$ext\n" - ); if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) { return $ext; diff --git a/src/Service/PdfIndexService.php b/src/Service/PdfIndexService.php index f689c91..57fa73e 100644 --- a/src/Service/PdfIndexService.php +++ b/src/Service/PdfIndexService.php @@ -10,119 +10,148 @@ class PdfIndexService { private string $projectDir; - // pro PHP-Process genau 1x resetten private bool $didReset = false; - - // pro Crawl-Durchlauf: doppelte Verarbeitung vermeiden private array $seenThisCrawl = []; public function __construct(ParameterBagInterface $params) { $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/'); + fwrite(STDERR, "[Meili PDF DEBUG] projectDir={$this->projectDir}\n"); } - /** - * Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen. - */ public function resetTableOnce(): void { if ($this->didReset) { + fwrite(STDERR, "[Meili PDF DEBUG] resetTableOnce(): already reset\n"); return; } + fwrite(STDERR, "[Meili PDF DEBUG] resetTableOnce(): TRUNCATE tl_search_pdf\n"); + $this->didReset = true; $this->seenThisCrawl = []; try { Database::getInstance()->execute('TRUNCATE tl_search_pdf'); } catch (\Throwable $e) { - error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage()); + fwrite(STDERR, "[Meili PDF DEBUG] TRUNCATE failed: {$e->getMessage()}\n"); } } - /** - * @param array $pdfLinks - */ public function handlePdfLinks(array $pdfLinks): void { + fwrite( + STDERR, + "[Meili PDF DEBUG] handlePdfLinks(): count=" . count($pdfLinks) . "\n" + ); + foreach ($pdfLinks as $row) { $url = (string) ($row['url'] ?? ''); $linkText = $row['linkText'] ?? null; + fwrite(STDERR, "\n[Meili PDF DEBUG] URL={$url}\n"); + if ($url === '') { + fwrite(STDERR, "[Meili PDF DEBUG] → empty URL, skip\n"); continue; } - try { - // innerhalb des Crawls gleiche URL nicht mehrfach parsen - $seenKey = md5($url); - if (isset($this->seenThisCrawl[$seenKey])) { - continue; - } - $this->seenThisCrawl[$seenKey] = true; - - $normalizedPath = $this->normalizePdfUrl($url); - if ($normalizedPath === null) { - continue; - } - - $absolutePath = $this->getAbsolutePath($normalizedPath); - if (!is_file($absolutePath)) { - continue; - } - - $mtime = (int) (filemtime($absolutePath) ?: 0); - $checksum = md5($normalizedPath . '|' . $mtime); - - // Titel-Priorität: - // 1) Linktext - // 2) PDF-Metadaten Title - // 3) Dateiname - $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath); - $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath)); - - $text = $this->parsePdf($absolutePath); - if ($text === '') { - continue; - } - - $this->upsertPdf( - $normalizedPath, - $title, - $text, - $checksum, - $mtime - ); - - } catch (\Throwable $e) { - error_log( - '[ContaoMeilisearch] PDF indexing failed for "' . $url . '": ' . $e->getMessage() - ); + $seenKey = md5($url); + if (isset($this->seenThisCrawl[$seenKey])) { + fwrite(STDERR, "[Meili PDF DEBUG] → already processed, skip\n"); + continue; } + $this->seenThisCrawl[$seenKey] = true; + + $normalizedPath = $this->normalizePdfUrl($url); + fwrite( + STDERR, + "[Meili PDF DEBUG] normalizePdfUrl() → " + . ($normalizedPath ?? 'NULL') + . "\n" + ); + + if ($normalizedPath === null) { + fwrite(STDERR, "[Meili PDF DEBUG] → normalization failed, skip\n"); + continue; + } + + $absolutePath = $this->getAbsolutePath($normalizedPath); + fwrite(STDERR, "[Meili PDF DEBUG] absolutePath={$absolutePath}\n"); + + if (!is_file($absolutePath)) { + fwrite(STDERR, "[Meili PDF DEBUG] → file does NOT exist\n"); + continue; + } + + fwrite(STDERR, "[Meili PDF DEBUG] → file exists\n"); + + $mtime = (int) (filemtime($absolutePath) ?: 0); + $checksum = md5($normalizedPath . '|' . $mtime); + + fwrite( + STDERR, + "[Meili PDF DEBUG] mtime={$mtime} checksum={$checksum}\n" + ); + + $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath); + fwrite( + STDERR, + "[Meili PDF DEBUG] metaTitle=" + . ($pdfMetaTitle ?: 'NULL') + . "\n" + ); + + $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath)); + fwrite(STDERR, "[Meili PDF DEBUG] final title={$title}\n"); + + $text = $this->parsePdf($absolutePath); + fwrite( + STDERR, + "[Meili PDF DEBUG] parsed text length=" . strlen($text) . "\n" + ); + + if ($text === '') { + fwrite(STDERR, "[Meili PDF DEBUG] → empty text, skip\n"); + continue; + } + + fwrite(STDERR, "[Meili PDF DEBUG] → writing to DB\n"); + + $this->upsertPdf( + $normalizedPath, + $title, + $text, + $checksum, + $mtime + ); } } private function normalizePdfUrl(string $url): ?string { - // Fall 1: direkter /files/-Pfad + fwrite(STDERR, "[Meili PDF DEBUG] normalizePdfUrl(): {$url}\n"); + if (str_starts_with($url, '/files/') && preg_match('~\.pdf(\?.*)?$~i', $url)) { - return preg_replace('~\?.*$~', '', $url); + $r = preg_replace('~\?.*$~', '', $url); + fwrite(STDERR, "[Meili PDF DEBUG] → direct /files path {$r}\n"); + return $r; } $decoded = html_entity_decode($url); $parts = parse_url($decoded); - // Fall 2: absolute URL auf gleiche Site if ( !empty($parts['path']) && str_starts_with($parts['path'], '/files/') && str_ends_with(strtolower($parts['path']), '.pdf') ) { + fwrite(STDERR, "[Meili PDF DEBUG] → absolute URL path {$parts['path']}\n"); return $parts['path']; } - // Fall 3: Contao-Download-Link mit ?p= if (empty($parts['query'])) { + fwrite(STDERR, "[Meili PDF DEBUG] → no query\n"); return null; } @@ -130,9 +159,12 @@ class PdfIndexService if (!empty($query['p'])) { $p = urldecode((string) $query['p']); - return '/files/' . ltrim($p, '/'); + $r = '/files/' . ltrim($p, '/'); + fwrite(STDERR, "[Meili PDF DEBUG] → p= normalized {$r}\n"); + return $r; } + fwrite(STDERR, "[Meili PDF DEBUG] → no usable parameter\n"); return null; } @@ -141,8 +173,13 @@ class PdfIndexService return $this->projectDir . '/' . ltrim($relativePath, '/'); } - private function upsertPdf(string $url, string $title, string $text, string $checksum, int $mtime): void - { + private function upsertPdf( + string $url, + string $title, + string $text, + string $checksum, + int $mtime + ): void { try { Database::getInstance() ->prepare(' @@ -165,9 +202,12 @@ class PdfIndexService $checksum, $mtime ); + + fwrite(STDERR, "[Meili PDF DEBUG] → DB write OK\n"); } catch (\Throwable $e) { - error_log( - '[ContaoMeilisearch] Failed to write PDF index entry (' . $url . '): ' . $e->getMessage() + fwrite( + STDERR, + "[Meili PDF DEBUG] DB write failed: {$e->getMessage()}\n" ); } } @@ -177,18 +217,39 @@ class PdfIndexService try { $parser = new Parser(); $pdf = $parser->parseFile($absolutePath); - $text = $this->cleanPdfContent($pdf->getText()); - return mb_substr($text, 0, 20000); } catch (\Throwable $e) { - error_log( - '[ContaoMeilisearch] Failed to parse PDF "' . $absolutePath . '": ' . $e->getMessage() + fwrite( + STDERR, + "[Meili PDF DEBUG] parsePdf failed: {$e->getMessage()}\n" ); return ''; } } + private function readPdfMetaTitle(string $absolutePath): ?string + { + try { + $parser = new Parser(); + $pdf = $parser->parseFile($absolutePath); + $details = $pdf->getDetails(); + + foreach (['Title', 'title'] as $key) { + if (!empty($details[$key]) && is_string($details[$key])) { + return trim($details[$key]); + } + } + } catch (\Throwable $e) { + fwrite( + STDERR, + "[Meili PDF DEBUG] readPdfMetaTitle failed: {$e->getMessage()}\n" + ); + } + + return null; + } + private function cleanPdfContent(string $text): string { if (class_exists(\Normalizer::class)) { @@ -198,34 +259,8 @@ class PdfIndexService $text = str_replace(["\r\n", "\r"], "\n", $text); $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text); $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text); - $text = str_replace(["\\'", "’", "‘"], "'", $text); $text = preg_replace('/\s+/u', ' ', $text); return trim($text); } - - private function readPdfMetaTitle(string $absolutePath): ?string - { - try { - $parser = new Parser(); - $pdf = $parser->parseFile($absolutePath); - - $details = $pdf->getDetails(); - - foreach (['Title', 'title'] as $key) { - if (!empty($details[$key]) && is_string($details[$key])) { - $t = trim($details[$key]); - if ($t !== '') { - return $t; - } - } - } - } catch (\Throwable $e) { - error_log( - '[ContaoMeilisearch] Failed to read PDF metadata "' . $absolutePath . '": ' . $e->getMessage() - ); - } - - return null; - } } \ No newline at end of file