From 9edb1e4713935d01d693b19bc37e6f58691be690 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=BCrgen=20Mummert?= <juergen@MacBookPro.fritz.box>
Date: Sun, 28 Dec 2025 11:29:13 +0100
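Subject: [PATCH] Index MEILISEARCH_JSON page metadata, rework debug logging

IndexPageListener: read priority, keywords, search image and start date
from the MEILISEARCH_JSON HTML comment, write them to the search set and
fold them into the page checksum; drop the STDERR debug output and
report failures via error_log().

PdfIndexService: add STDERR debug tracing, remove the per-link try/catch
in handlePdfLinks() and the quote normalisation in cleanPdfContent().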
---
 src/EventListener/IndexPageListener.php | 178 ++++++++++++-------
 src/Service/PdfIndexService.php         | 223 ++++++++++++++----------
 2 files changed, 241 insertions(+), 160 deletions(-)

diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php
index 4fb0dc5..86733c9 100644
--- a/src/EventListener/IndexPageListener.php
+++ b/src/EventListener/IndexPageListener.php
@@ -15,18 +15,92 @@ class IndexPageListener
 
     public function onIndexPage(string $content, array &$data, array &$set): void
     {
-        fwrite(STDERR, "\n[Meili DEBUG] onIndexPage() called\n");
-
         /*
          * =====================
          * PDF: Reset genau 1× pro Crawl
          * =====================
          */
         try {
-            fwrite(STDERR, "[Meili DEBUG] resetTableOnce()\n");
             $this->pdfIndexService->resetTableOnce();
         } catch (\Throwable $e) {
-            fwrite(STDERR, "[Meili DEBUG] PDF reset failed: {$e->getMessage()}\n");
+            error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage());
+        }
+
+        /*
+         * =====================
+         * SEITEN-METADATEN
+         * =====================
+         */
+        if (str_contains($content, 'MEILISEARCH_JSON')) {
+            try {
+                $parsed = $this->extractMeilisearchJson($content);
+            } catch (\Throwable $e) {
+                error_log('[ContaoMeilisearch] Failed to extract MEILISEARCH_JSON: ' . $e->getMessage());
+                $parsed = null;
+            }
+
+            if (is_array($parsed)) {
+
+                // PRIORITY
+                $priority =
+                    $parsed['event']['priority']
+                    ?? $parsed['news']['priority']
+                    ?? $parsed['page']['priority']
+                    ?? null;
+
+                if ($priority !== null && $priority !== '') {
+                    $set['priority'] = (int) $priority;
+                }
+
+                // KEYWORDS
+                $keywordSources = [
+                    $parsed['event']['keywords'] ?? null,
+                    $parsed['news']['keywords']  ?? null,
+                    $parsed['page']['keywords']  ?? null,
+                ];
+
+                $keywords = [];
+                foreach ($keywordSources as $src) {
+                    if (!is_string($src) || trim($src) === '') {
+                        continue;
+                    }
+                    foreach (preg_split('/\s+/', trim($src)) as $word) {
+                        $keywords[] = $word;
+                    }
+                }
+
+                if ($keywords) {
+                    $set['keywords'] = implode(' ', array_unique($keywords));
+                }
+
+                // IMAGEPATH
+                if (!empty($parsed['page']['searchimage'])) {
+                    $set['imagepath'] = trim((string) $parsed['page']['searchimage']);
+                }
+
+                // STARTDATE
+                $startDate =
+                    $parsed['event']['startDate']
+                    ?? $parsed['news']['startDate']
+                    ?? null;
+
+                if (is_numeric($startDate) && (int) $startDate > 0) {
+                    $set['startDate'] = (int) $startDate;
+                }
+
+                // CHECKSUM
+                try {
+                    $checksumSeed  = (string) ($data['checksum'] ?? '');
+                    $checksumSeed .= '|' . ($set['keywords']  ?? '');
+                    $checksumSeed .= '|' . ($set['priority']  ?? '');
+                    $checksumSeed .= '|' . ($set['imagepath'] ?? '');
+                    $checksumSeed .= '|' . ($set['startDate'] ?? '');
+
+                    $set['checksum'] = md5($checksumSeed);
+                } catch (\Throwable $e) {
+                    error_log('[ContaoMeilisearch] Failed to generate checksum: ' . $e->getMessage());
+                }
+            }
         }
 
         /*
@@ -35,91 +109,67 @@ class IndexPageListener
          * =====================
          */
         if ((int) ($data['protected'] ?? 0) !== 0) {
-            fwrite(STDERR, "[Meili DEBUG] Page is protected → skip files\n");
             return;
         }
 
         $indexPdfs   = (bool) Config::get('meilisearch_index_pdfs');
         $indexOffice = (bool) Config::get('meilisearch_index_office');
 
-        fwrite(
-            STDERR,
-            "[Meili DEBUG] Settings: pdfs="
-            . ($indexPdfs ? '1' : '0')
-            . " office="
-            . ($indexOffice ? '1' : '0')
-            . "\n"
-        );
-
         if (!$indexPdfs && !$indexOffice) {
-            fwrite(STDERR, "[Meili DEBUG] No file indexing enabled → return\n");
             return;
         }
 
         $links = $this->findAllLinks($content);
-        fwrite(STDERR, "[Meili DEBUG] Found " . count($links) . " <a> links\n");
 
         $pdfLinks    = [];
         $officeLinks = [];
 
         foreach ($links as $link) {
-            fwrite(STDERR, "[Meili DEBUG] URL: {$link['url']}\n");
-
             $type = $this->detectIndexableFileType($link['url']);
-            fwrite(
-                STDERR,
-                "[Meili DEBUG]  → detected type: "
-                . ($type ?? 'none')
-                . "\n"
-            );
 
-            if ($type === 'pdf') {
-                if ($indexPdfs) {
-                    fwrite(STDERR, "[Meili DEBUG]  → add to PDF queue\n");
-                    $pdfLinks[] = $link;
-                } else {
-                    fwrite(STDERR, "[Meili DEBUG]  → PDF indexing disabled\n");
-                }
+            if ($type === 'pdf' && $indexPdfs) {
+                $pdfLinks[] = $link;
                 continue;
             }
 
-            if (in_array($type, ['docx', 'xlsx', 'pptx'], true)) {
-                if ($indexOffice) {
-                    fwrite(STDERR, "[Meili DEBUG]  → add to OFFICE queue\n");
-                    $officeLinks[] = $link;
-                } else {
-                    fwrite(STDERR, "[Meili DEBUG]  → Office indexing disabled\n");
-                }
-                continue;
+            if (
+                in_array($type, ['docx', 'xlsx', 'pptx'], true)
+                && $indexOffice
+            ) {
+                $officeLinks[] = $link;
             }
-
-            fwrite(STDERR, "[Meili DEBUG]  → ignored\n");
         }
 
-        fwrite(
-            STDERR,
-            "[Meili DEBUG] Final queues: pdf="
-            . count($pdfLinks)
-            . " office="
-            . count($officeLinks)
-            . "\n"
-        );
-
         try {
             if ($pdfLinks !== []) {
-                fwrite(STDERR, "[Meili DEBUG] Calling handlePdfLinks()\n");
                 $this->pdfIndexService->handlePdfLinks($pdfLinks);
             }
 
             if ($officeLinks !== []) {
-                fwrite(STDERR, "[Meili DEBUG] Calling handleOfficeLinks()\n");
                 $this->officeIndexService->handleOfficeLinks($officeLinks);
             }
         } catch (\Throwable $e) {
-            fwrite(STDERR, "[Meili DEBUG] File indexing failed: {$e->getMessage()}\n");
+            error_log('[ContaoMeilisearch] File indexing failed: ' . $e->getMessage());
         }
     }
 
+    /**
+     * Extrahiert MEILISEARCH_JSON aus HTML-Kommentar
+     */
+    private function extractMeilisearchJson(string $content): ?array
+    {
+        if (!preg_match('/<!--\s*MEILISEARCH_JSON\s*(\{.*?\})\s*-->/s', $content, $m)) {
+            return null;
+        }
+
+        $json = preg_replace('/^\xEF\xBB\xBF/', '', trim($m[1]));
+        $data = json_decode($json, true);
+
+        return json_last_error() === JSON_ERROR_NONE && is_array($data)
+            ? $data
+            : null;
+    }
+
     /**
      * Sammle alle <a href="…"> Links
      */
@@ -150,39 +200,35 @@ class IndexPageListener
      */
     private function detectIndexableFileType(string $url): ?string
     {
-        fwrite(STDERR, "[Meili DEBUG] detectIndexableFileType(): $url\n");
-
+        // Hash entfernen
         $url = strtok($url, '#');
-        $parts = parse_url($url);
 
+        $parts = parse_url($url);
         if (!$parts) {
-            fwrite(STDERR, "[Meili DEBUG]  → parse_url failed\n");
             return null;
         }
 
+        // direkter Pfad (/files/…)
         if (!empty($parts['path'])) {
             $ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
-            fwrite(STDERR, "[Meili DEBUG]  → path ext: $ext\n");
-
             if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
                 return $ext;
             }
         }
 
+        // Query-Parameter (Contao 4 + 5)
         if (!empty($parts['query'])) {
             parse_str($parts['query'], $query);
 
             foreach (['file', 'p', 'f'] as $param) {
                 if (!empty($query[$param])) {
-                    $candidate = rawurldecode(
-                        html_entity_decode((string) $query[$param], ENT_QUOTES)
-                    );
+                    $candidate = (string) $query[$param];
+
+                    // sicher decodieren (Contao 4 + 5)
+                    $candidate = html_entity_decode($candidate, ENT_QUOTES);
+                    $candidate = rawurldecode($candidate);
 
                     $ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION));
-                    fwrite(
-                        STDERR,
-                        "[Meili DEBUG]  → query $param=$candidate ext=$ext\n"
-                    );
 
                     if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
                         return $ext;
diff --git a/src/Service/PdfIndexService.php b/src/Service/PdfIndexService.php
index f689c91..57fa73e 100644
--- a/src/Service/PdfIndexService.php
+++ b/src/Service/PdfIndexService.php
@@ -10,119 +10,148 @@ class PdfIndexService
 {
     private string $projectDir;
 
-    // pro PHP-Process genau 1x resetten
     private bool $didReset = false;
-
-    // pro Crawl-Durchlauf: doppelte Verarbeitung vermeiden
     private array $seenThisCrawl = [];
 
     public function __construct(ParameterBagInterface $params)
     {
         $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
+        fwrite(STDERR, "[Meili PDF DEBUG] projectDir={$this->projectDir}\n");
     }
 
-    /**
-     * Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen.
-     */
     public function resetTableOnce(): void
     {
         if ($this->didReset) {
+            fwrite(STDERR, "[Meili PDF DEBUG] resetTableOnce(): already reset\n");
             return;
         }
 
+        fwrite(STDERR, "[Meili PDF DEBUG] resetTableOnce(): TRUNCATE tl_search_pdf\n");
+
         $this->didReset = true;
         $this->seenThisCrawl = [];
 
         try {
             Database::getInstance()->execute('TRUNCATE tl_search_pdf');
         } catch (\Throwable $e) {
-            error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage());
+            fwrite(STDERR, "[Meili PDF DEBUG] TRUNCATE failed: {$e->getMessage()}\n");
         }
     }
 
-    /**
-     * @param array<int,array{url:string,linkText:?string}> $pdfLinks
-     */
     public function handlePdfLinks(array $pdfLinks): void
     {
+        fwrite(
+            STDERR,
+            "[Meili PDF DEBUG] handlePdfLinks(): count=" . count($pdfLinks) . "\n"
+        );
+
         foreach ($pdfLinks as $row) {
             $url = (string) ($row['url'] ?? '');
             $linkText = $row['linkText'] ?? null;
 
+            fwrite(STDERR, "\n[Meili PDF DEBUG] URL={$url}\n");
+
             if ($url === '') {
+                fwrite(STDERR, "[Meili PDF DEBUG] → empty URL, skip\n");
                 continue;
             }
 
-            try {
-                // innerhalb des Crawls gleiche URL nicht mehrfach parsen
-                $seenKey = md5($url);
-                if (isset($this->seenThisCrawl[$seenKey])) {
-                    continue;
-                }
-                $this->seenThisCrawl[$seenKey] = true;
-
-                $normalizedPath = $this->normalizePdfUrl($url);
-                if ($normalizedPath === null) {
-                    continue;
-                }
-
-                $absolutePath = $this->getAbsolutePath($normalizedPath);
-                if (!is_file($absolutePath)) {
-                    continue;
-                }
-
-                $mtime = (int) (filemtime($absolutePath) ?: 0);
-                $checksum = md5($normalizedPath . '|' . $mtime);
-
-                // Titel-Priorität:
-                // 1) Linktext
-                // 2) PDF-Metadaten Title
-                // 3) Dateiname
-                $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
-                $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
-
-                $text = $this->parsePdf($absolutePath);
-                if ($text === '') {
-                    continue;
-                }
-
-                $this->upsertPdf(
-                    $normalizedPath,
-                    $title,
-                    $text,
-                    $checksum,
-                    $mtime
-                );
-
-            } catch (\Throwable $e) {
-                error_log(
-                    '[ContaoMeilisearch] PDF indexing failed for "' . $url . '": ' . $e->getMessage()
-                );
+            $seenKey = md5($url);
+            if (isset($this->seenThisCrawl[$seenKey])) {
+                fwrite(STDERR, "[Meili PDF DEBUG] → already processed, skip\n");
+                continue;
             }
+            $this->seenThisCrawl[$seenKey] = true;
+
+            $normalizedPath = $this->normalizePdfUrl($url);
+            fwrite(
+                STDERR,
+                "[Meili PDF DEBUG] normalizePdfUrl() → "
+                . ($normalizedPath ?? 'NULL')
+                . "\n"
+            );
+
+            if ($normalizedPath === null) {
+                fwrite(STDERR, "[Meili PDF DEBUG] → normalization failed, skip\n");
+                continue;
+            }
+
+            $absolutePath = $this->getAbsolutePath($normalizedPath);
+            fwrite(STDERR, "[Meili PDF DEBUG] absolutePath={$absolutePath}\n");
+
+            if (!is_file($absolutePath)) {
+                fwrite(STDERR, "[Meili PDF DEBUG] → file does NOT exist\n");
+                continue;
+            }
+
+            fwrite(STDERR, "[Meili PDF DEBUG] → file exists\n");
+
+            $mtime = (int) (filemtime($absolutePath) ?: 0);
+            $checksum = md5($normalizedPath . '|' . $mtime);
+
+            fwrite(
+                STDERR,
+                "[Meili PDF DEBUG] mtime={$mtime} checksum={$checksum}\n"
+            );
+
+            $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
+            fwrite(
+                STDERR,
+                "[Meili PDF DEBUG] metaTitle="
+                . ($pdfMetaTitle ?: 'NULL')
+                . "\n"
+            );
+
+            $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
+            fwrite(STDERR, "[Meili PDF DEBUG] final title={$title}\n");
+
+            $text = $this->parsePdf($absolutePath);
+            fwrite(
+                STDERR,
+                "[Meili PDF DEBUG] parsed text length=" . strlen($text) . "\n"
+            );
+
+            if ($text === '') {
+                fwrite(STDERR, "[Meili PDF DEBUG] → empty text, skip\n");
+                continue;
+            }
+
+            fwrite(STDERR, "[Meili PDF DEBUG] → writing to DB\n");
+
+            $this->upsertPdf(
+                $normalizedPath,
+                $title,
+                $text,
+                $checksum,
+                $mtime
+            );
         }
     }
 
     private function normalizePdfUrl(string $url): ?string
     {
-        // Fall 1: direkter /files/-Pfad
+        fwrite(STDERR, "[Meili PDF DEBUG] normalizePdfUrl(): {$url}\n");
+
         if (str_starts_with($url, '/files/') && preg_match('~\.pdf(\?.*)?$~i', $url)) {
-            return preg_replace('~\?.*$~', '', $url);
+            $r = preg_replace('~\?.*$~', '', $url);
+            fwrite(STDERR, "[Meili PDF DEBUG] → direct /files path {$r}\n");
+            return $r;
         }
 
         $decoded = html_entity_decode($url);
         $parts = parse_url($decoded);
 
-        // Fall 2: absolute URL auf gleiche Site
         if (
             !empty($parts['path'])
             && str_starts_with($parts['path'], '/files/')
             && str_ends_with(strtolower($parts['path']), '.pdf')
         ) {
+            fwrite(STDERR, "[Meili PDF DEBUG] → absolute URL path {$parts['path']}\n");
             return $parts['path'];
         }
 
-        // Fall 3: Contao-Download-Link mit ?p=
         if (empty($parts['query'])) {
+            fwrite(STDERR, "[Meili PDF DEBUG] → no query\n");
             return null;
         }
 
@@ -130,9 +159,12 @@ class PdfIndexService
 
         if (!empty($query['p'])) {
             $p = urldecode((string) $query['p']);
-            return '/files/' . ltrim($p, '/');
+            $r = '/files/' . ltrim($p, '/');
+            fwrite(STDERR, "[Meili PDF DEBUG] → p= normalized {$r}\n");
+            return $r;
         }
 
+        fwrite(STDERR, "[Meili PDF DEBUG] → no usable parameter\n");
         return null;
     }
 
@@ -141,8 +173,13 @@ class PdfIndexService
         return $this->projectDir . '/' . ltrim($relativePath, '/');
     }
 
-    private function upsertPdf(string $url, string $title, string $text, string $checksum, int $mtime): void
-    {
+    private function upsertPdf(
+        string $url,
+        string $title,
+        string $text,
+        string $checksum,
+        int $mtime
+    ): void {
         try {
             Database::getInstance()
                 ->prepare('
@@ -165,9 +202,12 @@ class PdfIndexService
                     $checksum,
                     $mtime
                 );
+
+            fwrite(STDERR, "[Meili PDF DEBUG] → DB write OK\n");
         } catch (\Throwable $e) {
-            error_log(
-                '[ContaoMeilisearch] Failed to write PDF index entry (' . $url . '): ' . $e->getMessage()
+            fwrite(
+                STDERR,
+                "[Meili PDF DEBUG] DB write failed: {$e->getMessage()}\n"
             );
         }
     }
@@ -177,18 +217,39 @@ class PdfIndexService
         try {
             $parser = new Parser();
             $pdf = $parser->parseFile($absolutePath);
-
             $text = $this->cleanPdfContent($pdf->getText());
-
             return mb_substr($text, 0, 20000);
         } catch (\Throwable $e) {
-            error_log(
-                '[ContaoMeilisearch] Failed to parse PDF "' . $absolutePath . '": ' . $e->getMessage()
+            fwrite(
+                STDERR,
+                "[Meili PDF DEBUG] parsePdf failed: {$e->getMessage()}\n"
             );
             return '';
         }
     }
 
+    private function readPdfMetaTitle(string $absolutePath): ?string
+    {
+        try {
+            $parser = new Parser();
+            $pdf = $parser->parseFile($absolutePath);
+            $details = $pdf->getDetails();
+
+            foreach (['Title', 'title'] as $key) {
+                if (!empty($details[$key]) && is_string($details[$key])) {
+                    return trim($details[$key]);
+                }
+            }
+        } catch (\Throwable $e) {
+            fwrite(
+                STDERR,
+                "[Meili PDF DEBUG] readPdfMetaTitle failed: {$e->getMessage()}\n"
+            );
+        }
+
+        return null;
+    }
+
     private function cleanPdfContent(string $text): string
     {
         if (class_exists(\Normalizer::class)) {
@@ -198,34 +259,8 @@ class PdfIndexService
         $text = str_replace(["\r\n", "\r"], "\n", $text);
         $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
         $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
-        $text = str_replace(["\\'", "’", "‘"], "'", $text);
         $text = preg_replace('/\s+/u', ' ', $text);
 
         return trim($text);
     }
-
-    private function readPdfMetaTitle(string $absolutePath): ?string
-    {
-        try {
-            $parser = new Parser();
-            $pdf = $parser->parseFile($absolutePath);
-
-            $details = $pdf->getDetails();
-
-            foreach (['Title', 'title'] as $key) {
-                if (!empty($details[$key]) && is_string($details[$key])) {
-                    $t = trim($details[$key]);
-                    if ($t !== '') {
-                        return $t;
-                    }
-                }
-            }
-        } catch (\Throwable $e) {
-            error_log(
-                '[ContaoMeilisearch] Failed to read PDF metadata "' . $absolutePath . '": ' . $e->getMessage()
-            );
-        }
-
-        return null;
-    }
 }
\ No newline at end of file