Bugfix

2025-12-25 21:52:22 +01:00
parent 6e6f5904d9
commit bbb4d5cc6c
2 changed files with 127 additions and 41 deletions
@@ -132,13 +132,22 @@ class IndexPageListener
    private function findPdfLinks(string $content): array
    {
        if (!preg_match_all(
-            '/<a\s+[^>]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\']/i',
+            '/<a\s+[^>]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is',
            $content,
            $matches
        )) {
            return [];
        }

-        return array_unique(array_map('html_entity_decode', $matches[1]));
+        $result = [];
+
+        foreach ($matches[1] as $i => $href) {
+            $result[] = [
+                'url' => html_entity_decode($href),
+                'linkText' => trim(strip_tags($matches[2][$i])) ?: null,
+            ];
+        }
+
+        return $result;
    }
 }
@@ -8,71 +8,84 @@ use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;

 class PdfIndexService
 {
-    private bool $tableReset = false;
    private string $projectDir;
+    private bool $crawlStarted = false;

    public function __construct(ParameterBagInterface $params)
    {
        $this->projectDir = rtrim($params->get('kernel.project_dir'), '/');
    }

-    /**
-     * 🔥 Wird bei JEDEM Crawl einmal aufgerufen
-     */
-    public function resetTableOnce(): void
+    /* =====================================================
+     * Crawl-Start (immer aufrufen!)
+     * ===================================================== */
+    public function startCrawl(): void
    {
-        if ($this->tableReset) {
+        if ($this->crawlStarted) {
            return;
        }

-        Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
-        error_log('tl_search_pdf wurde geleert');
+        $this->crawlStarted = true;

-        $this->tableReset = true;
+        // bewusst simpel: bei JEDEM Crawl komplett leeren
+        Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
+
+        error_log('PDF Crawl gestartet → tl_search_pdf geleert');
    }

-    /**
-     * Einstiegspunkt vom Listener
-     */
+    /* =====================================================
+     * Einstiegspunkt aus IndexPageListener
+     * ===================================================== */
    public function handlePdfLinks(array $pdfLinks): void
    {
-        foreach ($pdfLinks as $url) {
+        foreach ($pdfLinks as $pdf) {
            try {
-                $path = $this->normalizePdfUrl($url);
-                if ($path === null) {
+                $url = $pdf['url'];
+                $linkText = $pdf['linkText'] ?? null;
+
+                error_log('bearbeite PDF: ' . $url);
+
+                $relativePath = $this->normalizePdfUrl($url);
+                if ($relativePath === null) {
+                    error_log('→ übersprungen: kein gültiger PDF-Pfad');
                    continue;
                }

-                $absolutePath = $this->projectDir . '/' . ltrim($path, '/');
+                $absolutePath = $this->projectDir . '/' . ltrim($relativePath, '/');
                if (!is_file($absolutePath)) {
+                    error_log('→ übersprungen: Datei existiert nicht');
                    continue;
                }

-                $parser = new Parser();
-                $pdf = $parser->parseFile($absolutePath);
-                $text = $this->cleanPdfContent($pdf->getText());
+                $mtime = filemtime($absolutePath) ?: 0;
+                $checksum = md5($relativePath . $mtime);
+
+                // PDF parsen
+                [$text, $metaTitle] = $this->parsePdf($absolutePath);

                if ($text === '') {
+                    error_log('→ übersprungen: kein Textinhalt');
                    continue;
                }

-                Database::getInstance()
-                    ->prepare('
-                        INSERT INTO tl_search_pdf
-                            (tstamp, url, title, text, checksum, file_mtime)
-                        VALUES (?, ?, ?, ?, ?, ?)
-                    ')
-                    ->execute(
-                        time(),
-                        $path,
-                        basename($absolutePath),
-                        mb_substr($text, 0, 5000),
-                        md5($path),
-                        filemtime($absolutePath) ?: 0
-                    );
+                // TITEL-PRIORITÄT
+                $title =
+                    $linkText
+                        ?: $metaTitle
+                        ?: basename($absolutePath);
+
+                $this->insertPdf(
+                    $relativePath,
+                    $title,
+                    $text,
+                    $checksum,
+                    $mtime
+                );
+
+                error_log('→ geschrieben in tl_search_pdf');

            } catch (\Throwable $e) {
-                error_log('PDF Fehler: ' . $e->getMessage());
+                error_log('PDF Service FEHLER: ' . $e->getMessage());
            }
        }
    }
@@ -82,18 +95,22 @@ class PdfIndexService
     * ===================================================== */
    private function normalizePdfUrl(string $url): ?string
    {
-        if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) {
+        // direkter /files-Link
+        if (str_starts_with($url, '/files/') && str_ends_with(strtolower($url), '.pdf')) {
            return $url;
        }

-        $parts = parse_url(html_entity_decode($url));
+        // Contao-Download-Link (?p=)
+        $decoded = html_entity_decode($url);
+        $parts = parse_url($decoded);
+
        if (!isset($parts['query'])) {
            return null;
        }

        parse_str($parts['query'], $query);

-        if (!empty($query['p'])) {
+        if (!empty($query['p']) && str_ends_with(strtolower($query['p']), '.pdf')) {
            return '/files/' . ltrim($query['p'], '/');
        }

@@ -101,16 +118,76 @@ class PdfIndexService
    }

    /* =====================================================
-     * Textbereinigung
+     * DB
+     * ===================================================== */
+    private function insertPdf(
+        string $url,
+        string $title,
+        string $text,
+        string $checksum,
+        int $mtime
+    ): void {
+        Database::getInstance()
+            ->prepare('
+                INSERT INTO tl_search_pdf
+                    (tstamp, url, title, text, checksum, file_mtime)
+                VALUES (?, ?, ?, ?, ?, ?)
+            ')
+            ->execute(
+                time(),
+                $url,
+                $title,
+                $text,
+                $checksum,
+                $mtime
+            );
+    }
+
+    /* =====================================================
+     * PDF Parsing
+     * ===================================================== */
+    private function parsePdf(string $absolutePath): array
+    {
+        try {
+            $parser = new Parser();
+            $pdf = $parser->parseFile($absolutePath);
+
+            $details = $pdf->getDetails();
+            $metaTitle = $details['Title'] ?? null;
+
+            $text = $this->cleanPdfContent($pdf->getText());
+
+            return [
+                mb_substr($text, 0, 5000),
+                is_string($metaTitle) && trim($metaTitle) !== '' ? trim($metaTitle) : null,
+            ];
+
+        } catch (\Throwable $e) {
+            error_log('PDF Parser FEHLER: ' . $e->getMessage());
+            return ['', null];
+        }
+    }
+
+    /* =====================================================
+     * Text-Bereinigung
     * ===================================================== */
    private function cleanPdfContent(string $text): string
    {
+        // Unicode normalisieren
        if (class_exists(\Normalizer::class)) {
            $text = \Normalizer::normalize($text, \Normalizer::FORM_C);
        }

+        // Sonderglyphen entfernen (Noten, Steuerzeichen etc.)
        $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
-        $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
+
+        // falsche Worttrennungen ("ges pielt")
+        $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', '', $text);
+
+        // Apostrophe vereinheitlichen
+        $text = str_replace(["\\'", "’", "‘"], "'", $text);
+
+        // Mehrfach-Leerzeichen
        $text = preg_replace('/\s+/u', ' ', $text);

        return trim($text);