Bugfix

2025-12-25 21:52:22 +01:00
parent 6e6f5904d9
commit bbb4d5cc6c
2 changed files with 127 additions and 41 deletions
@@ -132,13 +132,22 @@ class IndexPageListener
    /**
     * Collects PDF links from a piece of markup.
     *
     * NOTE(review): this method is shown as a flattened diff. Lines starting with `-`
     * belong to the old revision, lines starting with `+` to the new one.
     *
     * The new pattern matches anchor elements whose href either ends in `.pdf` or
     * contains `p=pdf/` / `p=pdf%2F` (case-insensitive). Group 1 captures the href,
     * group 2 the markup between the opening and the closing anchor tag.
     *
     * @param string $content Markup to scan.
     *
     * @return array Empty array when nothing matches; otherwise one
     *               ['url' => string, 'linkText' => string|null] entry per match.
     */
    private function findPdfLinks(string $content): array
    {
        // preg_match_all() yields 0 or false when there is nothing usable; both end here.
        if (!preg_match_all(
-            '/<a\s+[^>]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\']/i',
+            '/<a\s+[^>]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is',
            $content,
            $matches
        )) {
            return [];
        }
        // NOTE(review): the removed line de-duplicated the hrefs with array_unique();
        // the loop below does not, so a link that occurs twice yields two entries.
-        return array_unique(array_map('html_entity_decode', $matches[1]));
+        $result = [];
        // Pair every entity-decoded href with its tag-stripped, trimmed link text.
        // `?:` turns an empty text (and any other falsy text such as "0") into null.
        foreach ($matches[1] as $i => $href) {
            $result[] = [
                'url' => html_entity_decode($href),
                'linkText' => trim(strip_tags($matches[2][$i])) ?: null,
            ];
        }
        return $result;
    }
 }
@@ -8,71 +8,84 @@ use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
 class PdfIndexService
 {
    private bool $tableReset = false;
    private string $projectDir;
    private bool $crawlStarted = false;
    /**
     * Reads the `kernel.project_dir` parameter and stores it with trailing
     * slashes removed.
     *
     * @param ParameterBagInterface $params Parameter bag providing `kernel.project_dir`.
     */
    public function __construct(ParameterBagInterface $params)
    {
        $configuredDir = $params->get('kernel.project_dir');

        // Stored without trailing "/" so later path building can append "/" itself.
        $this->projectDir = rtrim($configuredDir, '/');
    }
-    /**
+    /* =====================================================
-     * 🔥 Wird bei JEDEM Crawl einmal aufgerufen
+     * Crawl-Start (immer aufrufen!)
-     */
+     * ===================================================== */
-    public function resetTableOnce(): void
+    public function startCrawl(): void
    {
        // NOTE(review): flattened diff view — lines starting with `-` are the old revision,
        // lines starting with `+` the new one; unmarked lines may belong to either side.
        // Visible new behaviour: return early once $crawlStarted is set; otherwise set the
        // flag, truncate tl_search_pdf and log the crawl start.
-        if ($this->tableReset) {
+        if ($this->crawlStarted) {
            return;
        }
-        Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
+        $this->crawlStarted = true;
        // NOTE(review): presumably the log call of the old revision — confirm against the real file.
        error_log('tl_search_pdf wurde geleert');
-        $this->tableReset = true;
+        // bewusst simpel: bei JEDEM Crawl komplett leeren
        // (deliberately simple: empty the table completely on EVERY crawl)
        Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
        error_log('PDF Crawl gestartet → tl_search_pdf geleert');
    }
-    /**
+    /* =====================================================
-     * Einstiegspunkt vom Listener
+     * Einstiegspunkt aus IndexPageListener
-     */
+     * ===================================================== */
    /**
     * Processes a list of PDF links and stores every readable PDF via insertPdf().
     *
     * NOTE(review): this method is shown as a flattened diff. Lines starting with `-`
     * are the old revision, lines starting with `+` the new one; unmarked lines may
     * belong to either side. The unmarked `filemtime($absolutePath) ?: 0` line right
     * after `+ $mtime` is presumably the last removed argument of the old execute()
     * call — confirm against the real file, because as new code it would be a
     * syntax error.
     *
     * New revision, per entry: read `url` and the optional `linkText`, resolve a
     * relative path with normalizePdfUrl(), and skip the entry when that is null,
     * when the file does not exist below $projectDir, or when parsePdf() returns an
     * empty text. The title falls back from link text to the PDF meta title to the
     * file's basename. Any Throwable is logged and the loop moves on to the next entry.
     *
     * @param array $pdfLinks Entries with a 'url' key and an optional 'linkText' key.
     */
    public function handlePdfLinks(array $pdfLinks): void
    {
-        foreach ($pdfLinks as $url) {
+        foreach ($pdfLinks as $pdf) {
            try {
-                $path = $this->normalizePdfUrl($url);
+                $url = $pdf['url'];
-                if ($path === null) {
+                $linkText = $pdf['linkText'] ?? null;
                error_log('bearbeite PDF: ' . $url);
                // null means the URL could not be mapped to a PDF path.
                $relativePath = $this->normalizePdfUrl($url);
                if ($relativePath === null) {
                    error_log('→ übersprungen: kein gültiger PDF-Pfad');
                    continue;
                }
-                $absolutePath = $this->projectDir . '/' . ltrim($path, '/');
+                $absolutePath = $this->projectDir . '/' . ltrim($relativePath, '/');
                if (!is_file($absolutePath)) {
                    error_log('→ übersprungen: Datei existiert nicht');
                    continue;
                }
                // The new checksum combines the relative path with the file's mtime
                // (0 when filemtime() fails); the old one hashed the path only.
-                $parser = new Parser();
+                $mtime = filemtime($absolutePath) ?: 0;
-                $pdf = $parser->parseFile($absolutePath);
+                $checksum = md5($relativePath . $mtime);
-                $text = $this->cleanPdfContent($pdf->getText());
+
                // Parse the PDF
                [$text, $metaTitle] = $this->parsePdf($absolutePath);
                if ($text === '') {
                    error_log('→ übersprungen: kein Textinhalt');
                    continue;
                }
-                Database::getInstance()
+                // TITEL-PRIORITÄT
-                    ->prepare('
+                $title =
-                        INSERT INTO tl_search_pdf
+                    $linkText
-                            (tstamp, url, title, text, checksum, file_mtime)
+                        ?: $metaTitle
-                        VALUES (?, ?, ?, ?, ?, ?)
+                        ?: basename($absolutePath);
-                    ')
+
-                    ->execute(
+                $this->insertPdf(
-                        time(),
+                    $relativePath,
-                        $path,
+                    $title,
-                        basename($absolutePath),
+                    $text,
-                        mb_substr($text, 0, 5000),
+                    $checksum,
-                        md5($path),
+                    $mtime
                        filemtime($absolutePath) ?: 0
                );
                error_log('→ geschrieben in tl_search_pdf');
            } catch (\Throwable $e) {
-                error_log('PDF Fehler: ' . $e->getMessage());
+                error_log('PDF Service FEHLER: ' . $e->getMessage());
            }
        }
    }
@@ -82,18 +95,22 @@ class PdfIndexService
     * ===================================================== */
    private function normalizePdfUrl(string $url): ?string
    {
-        if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) {
+        // direkter /files-Link
        if (str_starts_with($url, '/files/') && str_ends_with(strtolower($url), '.pdf')) {
            return $url;
        }
-        $parts = parse_url(html_entity_decode($url));
+        // Contao-Download-Link (?p=)
        $decoded = html_entity_decode($url);
        $parts = parse_url($decoded);
        if (!isset($parts['query'])) {
            return null;
        }
        parse_str($parts['query'], $query);
-        if (!empty($query['p'])) {
+        if (!empty($query['p']) && str_ends_with(strtolower($query['p']), '.pdf')) {
            return '/files/' . ltrim($query['p'], '/');
        }
@@ -101,16 +118,76 @@ class PdfIndexService
    }
    /* =====================================================
-     * Textbereinigung
+     * DB
     * ===================================================== */
    /**
     * Writes one row into tl_search_pdf through a prepared statement.
     *
     * The `tstamp` column receives the current time(); every other column is
     * filled from the arguments in the order given below.
     *
     * @param string $url      Value for the `url` column.
     * @param string $title    Value for the `title` column.
     * @param string $text     Value for the `text` column.
     * @param string $checksum Value for the `checksum` column.
     * @param int    $mtime    Value for the `file_mtime` column.
     */
    private function insertPdf(
        string $url,
        string $title,
        string $text,
        string $checksum,
        int $mtime
    ): void {
        $insertSql = '
                INSERT INTO tl_search_pdf
                    (tstamp, url, title, text, checksum, file_mtime)
                VALUES (?, ?, ?, ?, ?, ?)
            ';

        $statement = Database::getInstance()->prepare($insertSql);

        // Bound values follow the column order of the INSERT above.
        $statement->execute(time(), $url, $title, $text, $checksum, $mtime);
    }
    /* =====================================================
     * PDF Parsing
     * ===================================================== */
    /**
     * Extracts the cleaned text and the meta title of a PDF file.
     *
     * The text is passed through cleanPdfContent() and cut to at most 5000
     * characters. The title comes from the 'Title' entry of the document
     * details and is only used when it is a non-blank string.
     *
     * @param string $absolutePath Path of the PDF file to parse.
     *
     * @return array Two-element list [text, title-or-null]; ['', null] when
     *               anything throws while parsing (the error is logged).
     */
    private function parsePdf(string $absolutePath): array
    {
        try {
            $document = (new Parser())->parseFile($absolutePath);

            $info = $document->getDetails();
            $rawTitle = $info['Title'] ?? null;

            $cleaned = $this->cleanPdfContent($document->getText());

            // Only a string with visible characters counts as a title.
            $title = null;
            if (is_string($rawTitle)) {
                $trimmedTitle = trim($rawTitle);
                if ($trimmedTitle !== '') {
                    $title = $trimmedTitle;
                }
            }

            return [mb_substr($cleaned, 0, 5000), $title];
        } catch (\Throwable $e) {
            // Best effort: a broken PDF must not abort the caller, so report "no text".
            error_log('PDF Parser FEHLER: ' . $e->getMessage());

            return ['', null];
        }
    }
    /* =====================================================
     * Text-Bereinigung
     * ===================================================== */
    private function cleanPdfContent(string $text): string
    {
        // Unicode normalisieren
        if (class_exists(\Normalizer::class)) {
            $text = \Normalizer::normalize($text, \Normalizer::FORM_C);
        }
        // Sonderglyphen entfernen (Noten, Steuerzeichen etc.)
        $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
-        $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
+
        // falsche Worttrennungen ("ges pielt")
        $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', '', $text);
        // Apostrophe vereinheitlichen
        $text = str_replace(["\\'", "’", "‘"], "'", $text);
        // Mehrfach-Leerzeichen
        $text = preg_replace('/\s+/u', ' ', $text);
        return trim($text);