From 8f3a0ad1b2519e9ea6afa098e2435e2d9368b10e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=BCrgen=20Mummert?= <juergen@MacBookPro.fritz.box>
Date: Fri, 26 Dec 2025 11:24:55 +0100
Subject: [PATCH] Bugfix: always apply page metadata; add optional Office file indexing

---
 src/EventListener/IndexPageListener.php | 184 ++++++++++--------
 src/Service/OfficeIndexService.php      | 243 ++++++++++++++++++++++++
 2 files changed, 352 insertions(+), 75 deletions(-)
 create mode 100644 src/Service/OfficeIndexService.php

diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php
index 7590ce7..29eab2f 100644
--- a/src/EventListener/IndexPageListener.php
+++ b/src/EventListener/IndexPageListener.php
@@ -5,119 +5,131 @@ namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
 use Contao\Config;
 use Contao\System;
 use MummertMedia\ContaoMeilisearchBundle\Service\PdfIndexService;
+use MummertMedia\ContaoMeilisearchBundle\Service\OfficeIndexService;
 
 class IndexPageListener
 {
     private ?PdfIndexService $pdfIndexService = null;
+    private ?OfficeIndexService $officeIndexService = null;
 
     public function onIndexPage(string $content, array &$data, array &$set): void
     {
         // ✅ IMMER: Service einmal pro Crawl holen + Tabelle einmal leeren
         if ($this->pdfIndexService === null) {
             $this->pdfIndexService = System::getContainer()->get(PdfIndexService::class);
-            $this->pdfIndexService->resetTableOnce(); // <- darf NICHT von Checkbox abhängen!
-        }
-
-        // ✅ Checkbox steuert nur die PDF-Suche/Indexierung (nicht den Reset!)
-        $pdfEnabled = (bool) (Config::get('meilisearchIndexPdfs') ?? Config::get('meilisearch_index_pdfs'));
-        if (!$pdfEnabled) {
-            return;
-        }
-
-        // Marker vorhanden?
-        if (!str_contains($content, 'MEILISEARCH_JSON')) {
-            return;
-        }
-
-        $parsed = $this->extractMeilisearchJson($content);
-        if ($parsed === null) {
-            return;
+            $this->pdfIndexService->resetTableOnce(); // must NOT depend on the PDF/Office checkboxes
         }
 
         /*
          * =====================
-         * PRIORITY
+         * PAGE METADATA (ALWAYS, independent of the PDF/Office settings)
          * =====================
          */
-        $priority =
-            $parsed['event']['priority'] ?? null ??
-            $parsed['news']['priority']  ?? null ??
-            $parsed['page']['priority']  ?? null;
+        if (str_contains($content, 'MEILISEARCH_JSON')) { // only pages carrying the marker are parsed
+            $parsed = $this->extractMeilisearchJson($content);
 
-        if ($priority !== null && $priority !== '') {
-            $set['priority'] = (int) $priority;
-        }
+            if (is_array($parsed)) {
 
-        /*
-         * =====================
-         * KEYWORDS
-         * =====================
-         */
-        $keywordSources = [
-            $parsed['event']['keywords'] ?? null,
-            $parsed['news']['keywords']  ?? null,
-            $parsed['page']['keywords']  ?? null,
-        ];
+                /*
+                 * PRIORITY: first non-null value of event, news, page (in that order)
+                 */
+                $priority =
+                    $parsed['event']['priority'] ?? null ??
+                    $parsed['news']['priority']  ?? null ??
+                    $parsed['page']['priority']  ?? null;
 
-        $keywords = [];
-        foreach ($keywordSources as $src) {
-            if (!is_string($src) || trim($src) === '') {
-                continue;
-            }
+                if ($priority !== null && $priority !== '') {
+                    $set['priority'] = (int) $priority;
+                }
 
-            foreach (preg_split('/\s+/', trim($src)) as $word) {
-                $word = trim($word);
-                if ($word !== '') {
-                    $keywords[] = $word;
+                /*
+                 * KEYWORDS: whitespace-separated words of event, news and page, merged and de-duplicated
+                 */
+                $keywordSources = [
+                    $parsed['event']['keywords'] ?? null,
+                    $parsed['news']['keywords']  ?? null,
+                    $parsed['page']['keywords']  ?? null,
+                ];
+
+                $keywords = [];
+                foreach ($keywordSources as $src) {
+                    if (!is_string($src) || trim($src) === '') {
+                        continue;
+                    }
+
+                    foreach (preg_split('/\s+/', trim($src)) as $word) {
+                        $word = trim($word);
+                        if ($word !== '') {
+                            $keywords[] = $word;
+                        }
+                    }
+                }
+
+                if ($keywords) {
+                    $set['keywords'] = implode(' ', array_unique($keywords));
+                }
+
+                /*
+                 * IMAGEPATH: first non-null searchimage of event, news, page, custom (in that order)
+                 */
+                $image =
+                    $parsed['event']['searchimage']  ?? null ??
+                    $parsed['news']['searchimage']   ?? null ??
+                    $parsed['page']['searchimage']   ?? null ??
+                    $parsed['custom']['searchimage'] ?? null;
+
+                if (is_string($image) && $image !== '') {
+                    $set['imagepath'] = trim($image);
+                }
+
+                /*
+                 * STARTDATE: event or news date, stored as a timestamp when strtotime() can parse it
+                 */
+                $date =
+                    $parsed['event']['date'] ?? null ??
+                    $parsed['news']['date']  ?? null;
+
+                if (is_string($date) && $date !== '') {
+                    $ts = strtotime($date);
+                    if ($ts !== false) {
+                        $set['startDate'] = $ts;
+                    }
                 }
             }
         }
 
-        if ($keywords) {
-            $set['keywords'] = implode(' ', array_unique($keywords));
-        }
-
         /*
          * =====================
-         * IMAGEPATH
+         * PDF INDEXING (OPTIONAL)
          * =====================
          */
-        $image =
-            $parsed['event']['searchimage']  ?? null ??
-            $parsed['news']['searchimage']   ?? null ??
-            $parsed['page']['searchimage']   ?? null ??
-            $parsed['custom']['searchimage'] ?? null;
+        $pdfEnabled = (bool) Config::get('meilisearch_index_pdfs');
+        if ($pdfEnabled && (int) ($data['protected'] ?? 0) === 0) { // only when enabled and the page is not protected
 
-        if (is_string($image) && $image !== '') {
-            $set['imagepath'] = trim($image);
-        }
+            $pdfLinks = $this->findPdfLinks($content);
 
-        /*
-         * =====================
-         * STARTDATE
-         * =====================
-         */
-        $date =
-            $parsed['event']['date'] ?? null ??
-            $parsed['news']['date']  ?? null;
-
-        if (is_string($date) && $date !== '') {
-            $ts = strtotime($date);
-            if ($ts !== false) {
-                $set['startDate'] = $ts;
+            if ($pdfLinks !== []) {
+                $this->pdfIndexService->handlePdfLinks($pdfLinks);
             }
         }
 
         /*
          * =====================
-         * PDF-ERKENNUNG
+         * OFFICE INDEXING (OPTIONAL)
          * =====================
          */
-        $pdfLinks = $this->findPdfLinks($content);
+        $officeEnabled = (bool) Config::get('meilisearch_index_office');
+        if ($officeEnabled && (int) ($data['protected'] ?? 0) === 0) { // only when enabled and the page is not protected
 
-        // PDFs NUR auf öffentlichen Seiten indexieren
-        if ($pdfLinks !== [] && (int) ($data['protected'] ?? 0) === 0) {
-            $this->pdfIndexService->handlePdfLinks($pdfLinks);
+            if ($this->officeIndexService === null) { // fetched from the container on first use only
+                $this->officeIndexService = System::getContainer()->get(OfficeIndexService::class);
+            }
+
+            $officeLinks = $this->findOfficeLinks($content);
+
+            if ($officeLinks !== []) {
+                $this->officeIndexService->handleOfficeLinks($officeLinks);
+            }
         }
     }
 
@@ -154,4 +166,26 @@ class IndexPageListener
 
         return $result;
     }
+
+    /**
+     * Finds <a> tags whose href ends in .docx/.xlsx/.pptx or contains "p=<docx|xlsx|pptx>/" (slash may be %2F).
+     * @return array<int,array{url:string,linkText:?string}> entity-decoded href and tag-stripped link text
+     */
+    private function findOfficeLinks(string $content): array
+    {
+        $pattern = '/<a\s+[^>]*href=["\']([^"\']*(?:\.(?:docx|xlsx|pptx)|p=(?:docx|xlsx|pptx)(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is';
+        $links = [];
+
+        if (preg_match_all($pattern, $content, $anchors, PREG_SET_ORDER)) {
+            foreach ($anchors as [, $href, $inner]) {
+                $label = trim(strip_tags($inner));
+                $links[] = [
+                    'url'      => html_entity_decode($href),
+                    'linkText' => $label ?: null,
+                ];
+            }
+        }
+
+        return $links;
+    }
 }
\ No newline at end of file
diff --git a/src/Service/OfficeIndexService.php b/src/Service/OfficeIndexService.php
new file mode 100644
index 0000000..8071950
--- /dev/null
+++ b/src/Service/OfficeIndexService.php
@@ -0,0 +1,243 @@
+<?php
+
+namespace MummertMedia\ContaoMeilisearchBundle\Service;
+
+use Contao\Database;
+use PhpOffice\PhpWord\IOFactory as WordIOFactory;
+use PhpOffice\PhpSpreadsheet\IOFactory as SpreadsheetIOFactory;
+use PhpOffice\PhpPresentation\IOFactory as PresentationIOFactory;
+use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
+
+class OfficeIndexService
+{
+    private string $projectDir;
+
+    // per crawl run: URLs that were already handled, to avoid processing a file twice
+    private array $seenThisCrawl = [];
+
+    public function __construct(ParameterBagInterface $params)
+    {
+        $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/'); // stored without trailing slash
+    }
+
+    /**
+     * @param array<int,array{url:string,linkText:?string}> $officeLinks links to parse and store; failures are logged per link
+     */
+    public function handleOfficeLinks(array $officeLinks): void
+    {
+        foreach ($officeLinks as $row) {
+            $url = (string) ($row['url'] ?? '');
+            $linkText = $row['linkText'] ?? null;
+
+            if ($url === '') {
+                continue;
+            }
+
+            try {
+                error_log('bearbeite Office-Datei: ' . $url);
+
+                // do not parse the same URL again while this service instance lives (tracked in $seenThisCrawl)
+                $seenKey = md5($url);
+                if (isset($this->seenThisCrawl[$seenKey])) {
+                    error_log('→ übersprungen: bereits im Crawl verarbeitet');
+                    continue;
+                }
+                $this->seenThisCrawl[$seenKey] = true;
+
+                $normalized = $this->normalizeOfficeUrl($url);
+                if ($normalized === null) {
+                    error_log('→ übersprungen: kein gültiger Office-Pfad');
+                    continue;
+                }
+
+                [$relativePath, $type] = $normalized;
+
+                $absolutePath = $this->getAbsolutePath($relativePath);
+                if (!is_file($absolutePath)) {
+                    error_log('→ übersprungen: Datei existiert nicht: ' . $absolutePath);
+                    continue;
+                }
+
+                $mtime = (int) (filemtime($absolutePath) ?: 0);
+                $checksum = md5($relativePath . '|' . $mtime); // changes when the path or the modification time changes
+
+                $title = $linkText ?: basename($absolutePath); // fall back to the file name when the link has no text
+
+                $text = $this->parseOfficeFile($absolutePath, $type);
+                if ($text === '') {
+                    error_log('→ übersprungen: Office-Datei ohne Textinhalt');
+                    continue;
+                }
+
+                $this->upsertOffice(
+                    $relativePath,
+                    $title,
+                    $text,
+                    $checksum,
+                    $mtime,
+                    $type
+                );
+
+                error_log('geschrieben in tl_search_pdf');
+
+            } catch (\Throwable $e) { // a failing file is logged and does not abort the remaining links
+                error_log('Office Service FEHLER: ' . $e->getMessage());
+            }
+        }
+    }
+
+    /**
+     * Maps a crawled link to [path starting with /files/, lower-cased extension] for docx/xlsx/pptx files.
+     * Paths are percent-decoded; paths containing a ".." segment are rejected so they cannot leave /files/.
+     * @return array{string,string}|null [relativePath, type]
+     */
+    private function normalizeOfficeUrl(string $url): ?array
+    {
+        $parts = parse_url(html_entity_decode($url));
+        $candidates = [];
+        // Direct /files/ link: undo percent-encoding (e.g. %20) so the path matches the file on disk.
+        if (!empty($parts['path']) && str_starts_with($parts['path'], '/files/')) {
+            $candidates[] = rawurldecode($parts['path']);
+        }
+
+        // Contao download link: ?p=<path relative to /files/>; a non-string "p" (e.g. p[]=x) is ignored.
+        if (!empty($parts['query'])) {
+            parse_str($parts['query'], $query);
+            if (is_string($query['p'] ?? null) && $query['p'] !== '') {
+                $candidates[] = '/files/' . ltrim(rawurldecode($query['p']), '/');
+            }
+        }
+
+        foreach ($candidates as $path) {
+            $ext = strtolower(pathinfo($path, PATHINFO_EXTENSION));
+            $safe = !in_array('..', explode('/', str_replace('\\', '/', $path)), true);
+            if ($safe && in_array($ext, ['docx', 'xlsx', 'pptx'], true)) {
+                return [$path, $ext];
+            }
+        }
+
+        return null;
+    }
+
+    private function getAbsolutePath(string $relativePath): string
+    {
+        return $this->projectDir . '/' . ltrim($relativePath, '/'); // joined to the project dir; leading slashes dropped
+    }
+
+    /** Upserts the row in tl_search_pdf; on a duplicate key every column incl. checksum is refreshed. */
+    private function upsertOffice(
+        string $url,
+        string $title,
+        string $text,
+        string $checksum,
+        int $mtime,
+        string $type
+    ): void {
+        Database::getInstance()->prepare('
+            INSERT INTO tl_search_pdf
+                (tstamp, type, url, title, text, checksum, file_mtime)
+            VALUES
+                (?, ?, ?, ?, ?, ?, ?)
+            ON DUPLICATE KEY UPDATE
+                tstamp=VALUES(tstamp),
+                type=VALUES(type),
+                url=VALUES(url),
+                title=VALUES(title),
+                text=VALUES(text),
+                checksum=VALUES(checksum),
+                file_mtime=VALUES(file_mtime)
+        ')->execute(
+            time(),
+            $type,
+            $url,
+            $title,
+            $text,
+            $checksum,
+            $mtime
+        );
+    }
+
+    private function parseOfficeFile(string $absolutePath, string $type): string
+    {
+        return match ($type) { // one parser per supported type
+            'docx' => $this->parseDocx($absolutePath),
+            'xlsx' => $this->parseXlsx($absolutePath),
+            'pptx' => $this->parsePptx($absolutePath),
+            default => '', // any other type yields no text
+        };
+    }
+
+    /** Collects the text of each section's top-level elements of a .docx; returns '' (and logs) on failure. */
+    private function parseDocx(string $absolutePath): string
+    {
+        try {
+            $text = '';
+            foreach (WordIOFactory::load($absolutePath)->getSections() as $section) {
+                foreach ($section->getElements() as $element) {
+                    $value = method_exists($element, 'getText') ? $element->getText() : null;
+                    if (is_object($value) && method_exists($value, 'getText')) {
+                        $value = $value->getText(); // e.g. a PHPWord Title may wrap a TextRun instead of a string
+                    }
+                    $text .= is_string($value) ? ' ' . $value : '';
+                }
+            }
+            return $this->cleanText($text);
+        } catch (\Throwable $e) {
+            error_log('Office Service FEHLER: ' . $absolutePath . ': ' . $e->getMessage());
+            return '';
+        }
+    }
+
+    /** Joins the scalar cell values of all sheets of an .xlsx; returns '' (and logs the error) on failure. */
+    private function parseXlsx(string $absolutePath): string
+    {
+        try {
+            $text = '';
+            foreach (SpreadsheetIOFactory::load($absolutePath)->getAllSheets() as $sheet) {
+                foreach ($sheet->toArray() as $row) {
+                    // is_scalar drops null (empty) cells and any non-scalar values before joining
+                    $text .= ' ' . implode(' ', array_filter($row, 'is_scalar'));
+                }
+            }
+
+            return $this->cleanText($text);
+        } catch (\Throwable $e) {
+            error_log('Office Service FEHLER: ' . $absolutePath . ': ' . $e->getMessage());
+            return '';
+        }
+    }
+
+    /** Joins the plain text of all slide shapes of a .pptx; returns '' (and logs the error) on failure. */
+    private function parsePptx(string $absolutePath): string
+    {
+        try {
+            $text = '';
+            foreach (PresentationIOFactory::load($absolutePath)->getAllSlides() as $slide) {
+                foreach ($slide->getShapeCollection() as $shape) {
+                    // only shapes exposing getPlainText() contribute; all other shapes are skipped
+                    if (method_exists($shape, 'getPlainText')) {
+                        $text .= ' ' . $shape->getPlainText();
+                    }
+                }
+            }
+
+            return $this->cleanText($text);
+        } catch (\Throwable $e) {
+            error_log('Office Service FEHLER: ' . $absolutePath . ': ' . $e->getMessage());
+            return '';
+        }
+    }
+
+    /** NFC-normalizes text, keeps letters/digits/punctuation/separators, collapses whitespace, caps at 20000 chars. */
+    private function cleanText(string $text): string
+    {
+        $text = mb_scrub($text, 'UTF-8'); // invalid bytes would make the /u regexes below return null
+        if (class_exists(\Normalizer::class)) {
+            $text = \Normalizer::normalize($text, \Normalizer::FORM_C) ?: $text; // normalize() returns false on failure
+        }
+
+        $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', str_replace(["\r\n", "\r"], "\n", $text)) ?? $text;
+        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;
+        return trim(mb_substr($text, 0, 20000));
+    }
+}
\ No newline at end of file