Bugfix

2025-12-23 13:45:10 +01:00
parent 4a09f8530a
commit 9fa59c4b1a
1 changed files with 86 additions and 52 deletions
@@ -2,6 +2,8 @@
 namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
 use Contao\Database;
 use Contao\StringUtil;
 use Smalot\PdfParser\Parser;
 class IndexPageListener
@@ -13,16 +15,15 @@ class IndexPageListener
            return;
        }
-        // JSON aus Kommentar extrahieren + parsen
+        // JSON aus Kommentar extrahieren
        $parsed = $this->extractMeilisearchJson($content);
        if ($parsed === null) {
            return;
        }
        /*
         * =====================
-         * PRIORITY (event > news > page)
+         * PRIORITY
         * =====================
         */
        $priority =
@@ -36,7 +37,7 @@ class IndexPageListener
        /*
         * =====================
-         * KEYWORDS (merge)
+         * KEYWORDS
         * =====================
         */
        $keywordSources = [
@@ -45,26 +46,26 @@ class IndexPageListener
            $parsed['page']['keywords']  ?? null,
        ];
-        $kw = [];
+        $keywords = [];
-        foreach ($keywordSources as $s) {
+        foreach ($keywordSources as $src) {
-            if (!is_string($s) || trim($s) === '') {
+            if (!is_string($src) || trim($src) === '') {
                continue;
            }
-            foreach (preg_split('/\s+/', trim($s)) ?: [] as $p) {
+            foreach (preg_split('/\s+/', trim($src)) as $word) {
-                if ($p !== '') {
+                if ($word !== '') {
-                    $kw[] = $p;
+                    $keywords[] = $word;
                }
            }
        }
-        if ($kw) {
+        if ($keywords) {
-            $set['keywords'] = implode(' ', array_unique($kw));
+            $set['keywords'] = implode(' ', array_unique($keywords));
        }
        /*
         * =====================
-         * IMAGEPATH (event > news > page > custom)
+         * IMAGEPATH
         * =====================
         */
        $image =
@@ -79,7 +80,7 @@ class IndexPageListener
        /*
         * =====================
-         * STARTDATE (event/news)
+         * STARTDATE
         * =====================
         */
        $date =
@@ -95,20 +96,16 @@ class IndexPageListener
        /*
         * =====================
-         * PDF LINKS INDEXIEREN
+         * PDFS ALS EIGENE DOKUMENTE INDEXIEREN
         * =====================
         */
-        $pdfText = $this->extractPdfTextFromContent($content);
+        $this->indexPdfLinks($content);
        if ($pdfText !== '') {
            $set['text'] = trim(
                ($set['text'] ?? '') . "\n\n" . $pdfText
            );
        }
    }
    /**
-     * Extrahiert das JSON aus <!-- MEILISEARCH_JSON {...} -->
+     * -------------------------------------
     * JSON aus Kommentar extrahieren
     * -------------------------------------
     */
    private function extractMeilisearchJson(string $content): ?array
    {
@@ -123,39 +120,36 @@ class IndexPageListener
    }
    /**
-     * Findet PDF-Links im HTML und extrahiert deren Text
+     * -------------------------------------
     * PDF-Links finden und indexieren
     * -------------------------------------
     */
-    private function extractPdfTextFromContent(string $content): string
+    private function indexPdfLinks(string $content): void
    {
        if (!preg_match_all(
-            '/<a\s+[^>]*href=["\']([^"\']+\.pdf[^"\']*)["\'][^>]*>/i',
+            '/<a\s+[^>]*href=["\']([^"\']+\.pdf[^"\']*)["\'][^>]*>(.*?)<\/a>/is',
            $content,
            $matches
        )) {
-            return '';
+            return;
        }
-        $texts = [];
+        foreach ($matches[1] as $i => $url) {
-
+            $title = trim(strip_tags($matches[2][$i])) ?: basename($url);
-        foreach ($matches[1] as $url) {
+            $this->indexSinglePdf($url, $title);
            $pdfText = $this->parsePdfFromUrl($url);
            if ($pdfText !== '') {
                $texts[] = $pdfText;
            }
        }
        return implode("\n\n", $texts);
    }
    /**
-     * Lädt und parsed ein PDF (nur /files/)
+     * -------------------------------------
     * Einzelnes PDF indexieren
     * -------------------------------------
     */
-    private function parsePdfFromUrl(string $url): string
+    private function indexSinglePdf(string $url, string $title): void
    {
-        // Nur interne PDFs
+        // nur interne PDFs
        if (!str_contains($url, '/files/')) {
-            return '';
+            return;
        }
        // relative URLs normalisieren
@@ -166,19 +160,62 @@ class IndexPageListener
                . $url;
        }
        $checksum = md5($url);
        $db = Database::getInstance();
        // bereits indexiert?
        $exists = $db
            ->prepare('SELECT id FROM tl_search WHERE checksum=?')
            ->execute($checksum);
        if ($exists->numRows > 0) {
            return;
        }
        $text = $this->parsePdf($url);
        if ($text === '') {
            return;
        }
        $db
            ->prepare('
                INSERT INTO tl_search
                    (tstamp, title, url, text, checksum, protected, pid, type)
                VALUES
                    (?, ?, ?, ?, ?, ?, ?, ?)
            ')
            ->execute(
                time(),
                StringUtil::decodeEntities($title),
                $url,
                $text,
                $checksum,
                '',
                0,
                'file'
            );
    }
    /**
     * -------------------------------------
     * PDF parsen (Smalot)
     * -------------------------------------
     */
    private function parsePdf(string $url): string
    {
        try {
-            $pdfContent = @file_get_contents($url);
+            $content = @file_get_contents($url);
-            if (!$pdfContent) {
+            if (!$content) {
                return '';
            }
            $parser = new Parser();
-            $pdf = $parser->parseContent($pdfContent);
+            $pdf = $parser->parseContent($content);
            $text = $this->cleanPdfContent($pdf->getText());
-            // Begrenzung für Meilisearch
+            // Begrenzen (Performance + Relevanz)
-            return mb_substr($text, 0, 2000);
+            return mb_substr($text, 0, 5000);
        } catch (\Throwable) {
            return '';
@@ -186,17 +223,14 @@ class IndexPageListener
    }
    /**
-     * Bereinigt PDF-Text
+     * -------------------------------------
     * PDF-Text bereinigen
     * -------------------------------------
     */
    private function cleanPdfContent(string $content): string
    {
        // UTF-8 normalisieren
        $content = mb_convert_encoding($content, 'UTF-8', 'UTF-8');
        // Steuerzeichen entfernen
        $content = preg_replace('/[\x00-\x1F\x7F]/u', ' ', $content);
        // Whitespaces normalisieren
        $content = preg_replace('/\s+/u', ' ', $content);
        return trim($content);