From 6e41df002edee2e9678412e83d6495628aff661c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrgen=20Mummert?= Date: Sun, 11 Jan 2026 18:49:15 +0100 Subject: [PATCH] Tika Title encoding --- src/EventListener/IndexPageListener.php | 128 +++++++++++++++++------- 1 file changed, 94 insertions(+), 34 deletions(-) diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php index 24a97bc..661ea0f 100644 --- a/src/EventListener/IndexPageListener.php +++ b/src/EventListener/IndexPageListener.php @@ -3,6 +3,7 @@ namespace MummertMedia\ContaoMeilisearchBundle\EventListener; use Contao\Config; +use Contao\System; use Doctrine\DBAL\Connection; class IndexPageListener @@ -14,6 +15,8 @@ class IndexPageListener private function debug(string $message, array $context = []): void { + // Debug bewusst immer aktiv (bis du es wieder entfernst) + // Kontext kurz halten, damit Logs nicht explodieren $ctx = $context ? ' | ' . json_encode($context, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE) : ''; @@ -24,7 +27,10 @@ class IndexPageListener public function onIndexPage(string $content, array &$data, array &$set): void { $this->debug('Hook start', [ - 'url' => $data['url'] ?? null, + 'url' => $data['url'] ?? null, + 'protected' => $data['protected'] ?? null, + 'checksum' => $data['checksum'] ?? null, + 'set_keys' => array_keys($set), ]); /* @@ -32,21 +38,79 @@ class IndexPageListener * SEITEN-METADATEN * ===================== */ - if (str_contains($content, 'MEILISEARCH_JSON')) { - $parsed = $this->extractMeilisearchJson($content); + $hasMeta = str_contains($content, 'MEILISEARCH_JSON'); + $this->debug('Meta marker scan', [ + 'contains_MEILISEARCH_JSON' => $hasMeta, + 'content_length' => strlen($content), + ]); + + if ($hasMeta) { + try { + $parsed = $this->extractMeilisearchJson($content); + $this->debug('extractMeilisearchJson(): done', [ + 'parsed_is_array' => is_array($parsed), + 'parsed_keys' => is_array($parsed) ? array_keys($parsed) : null, + ]); + } catch (\Throwable $e) { + $this->debug('Failed to extract MEILISEARCH_JSON', [ + 'error' => $e->getMessage(), + 'class' => $e::class, + ]); + $parsed = null; + } if (is_array($parsed)) { - if (isset($parsed['page']['priority'])) { - $set['priority'] = (int) $parsed['page']['priority']; + + // PRIORITY + $priority = + $parsed['event']['priority'] + ?? $parsed['news']['priority'] + ?? $parsed['page']['priority'] + ?? null; + + if ($priority !== null && $priority !== '') { + $set['priority'] = (int) $priority; } + // KEYWORDS + $keywords = []; + foreach ([ + $parsed['event']['keywords'] ?? null, + $parsed['news']['keywords'] ?? null, + $parsed['page']['keywords'] ?? null, + ] as $src) { + if (is_string($src) && trim($src) !== '') { + foreach (preg_split('/\s+/', trim($src)) as $word) { + $keywords[] = $word; + } + } + } + + if ($keywords) { + $set['keywords'] = implode(' ', array_unique($keywords)); + } + + // IMAGEPATH if (!empty($parsed['page']['searchimage'])) { - $set['imagepath'] = (string) $parsed['page']['searchimage']; + $set['imagepath'] = trim((string) $parsed['page']['searchimage']); } + // STARTDATE + $startDate = + $parsed['event']['startDate'] + ?? $parsed['news']['startDate'] + ?? null; + + if (is_numeric($startDate) && (int) $startDate > 0) { + $set['startDate'] = (int) $startDate; + } + + // CHECKSUM $checksumSeed = (string) ($data['checksum'] ?? ''); + $checksumSeed .= '|' . ($set['keywords'] ?? ''); $checksumSeed .= '|' . ($set['priority'] ?? ''); $checksumSeed .= '|' . ($set['imagepath'] ?? ''); + $checksumSeed .= '|' . ($set['startDate'] ?? ''); $set['checksum'] = md5($checksumSeed); } @@ -54,7 +118,7 @@ class IndexPageListener /* * ===================== - * DATEI-ERKENNUNG + * DATEI-ERKENNUNG + UPSERT * ===================== */ if ((int) ($data['protected'] ?? 0) !== 0) { @@ -83,30 +147,21 @@ class IndexPageListener $time = time(); foreach ($fileLinks as $file) { - $url = strtok($file['url'], '#'); /* * ===================== - * 🔥 DOWNLOAD-URL-NORMALISIERUNG (FIX) + * URL-NORMALISIERUNG (NEU) * ===================== */ if (str_contains($url, 'p=')) { - parse_str(parse_url($url, PHP_URL_QUERY) ?? '', $q); - - if (!empty($q['p'])) { - $url = '/' . ltrim(rawurldecode($q['p']), '/'); - - $this->debug('Normalized download URL', [ - 'normalized' => $url, - ]); + parse_str(parse_url($url, PHP_URL_QUERY) ?? '', $query); + if (!empty($query['p'])) { + $url = '/' . ltrim(rawurldecode($query['p']), '/'); + $this->debug('Normalized download URL', ['url' => $url]); } } - if (!is_string($url) || $url === '') { - continue; - } - $path = parse_url($url, PHP_URL_PATH); $abs = $path ? TL_ROOT . '/' . ltrim($path, '/') : null; @@ -115,11 +170,11 @@ class IndexPageListener try { $existing = $db->fetchAssociative( - 'SELECT id FROM tl_search_files WHERE url = ?', + 'SELECT id, checksum FROM tl_search_files WHERE url = ?', [$url] ); } catch (\Throwable $e) { - $this->debug('DB error (file indexing)', [ + $this->debug('DB error', [ 'url' => $url, 'error' => $e->getMessage(), ]); @@ -130,11 +185,11 @@ class IndexPageListener $db->update( 'tl_search_files', [ - 'tstamp' => $time, - 'last_seen' => $time, - 'page_id' => (int) ($data['pid'] ?? 0), - 'file_mtime' => $mtime, - 'checksum' => $checksum, + 'tstamp' => $time, + 'last_seen' => $time, + 'page_id' => (int) ($data['pid'] ?? 0), + 'file_mtime' => $mtime, + 'checksum' => $checksum, ], ['id' => $existing['id']] ); @@ -155,10 +210,12 @@ class IndexPageListener } } - $this->debug('Hook end'); + $this->debug('Hook end', [ + 'final_set_keys' => array_keys($set), + ]); } - /* === Hilfsmethoden === */ + /* === Hilfsmethoden unverändert === */ private function extractMeilisearchJson(string $content): ?array { @@ -166,9 +223,12 @@ class IndexPageListener return null; } - $data = json_decode(trim($m[1]), true); + $json = preg_replace('/^\xEF\xBB\xBF/', '', trim($m[1])); + $data = json_decode($json, true); - return json_last_error() === JSON_ERROR_NONE ? $data : null; + return json_last_error() === JSON_ERROR_NONE && is_array($data) + ? $data + : null; } private function findAllLinks(string $content): array @@ -207,10 +267,10 @@ class IndexPageListener if (!empty($parts['query'])) { parse_str($parts['query'], $query); - foreach (['file', 'p', 'f'] as $param) { if (!empty($query[$param])) { - $ext = strtolower(pathinfo($query[$param], PATHINFO_EXTENSION)); + $candidate = rawurldecode(html_entity_decode((string) $query[$param], ENT_QUOTES)); + $ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION)); if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) { return $ext; }