From 3427f6b60b3af956992f7be1409329d74c51709b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrgen=20Mummert?= Date: Sun, 11 Jan 2026 18:51:38 +0100 Subject: [PATCH] Tika Title encoding --- src/EventListener/IndexPageListener.php | 201 ++++++++++++++---------- 1 file changed, 116 insertions(+), 85 deletions(-) diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php index 661ea0f..05954fe 100644 --- a/src/EventListener/IndexPageListener.php +++ b/src/EventListener/IndexPageListener.php @@ -4,23 +4,18 @@ namespace MummertMedia\ContaoMeilisearchBundle\EventListener; use Contao\Config; use Contao\System; -use Doctrine\DBAL\Connection; class IndexPageListener { - public function __construct( - private readonly Connection $connection, - ) { + public function __construct() + { } private function debug(string $message, array $context = []): void { // Debug bewusst immer aktiv (bis du es wieder entfernst) // Kontext kurz halten, damit Logs nicht explodieren - $ctx = $context - ? ' | ' . json_encode($context, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE) - : ''; - + $ctx = $context ? ' | ' . json_encode($context, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE) : ''; error_log('[ContaoMeilisearch][IndexPageListener] ' . $message . $ctx); } @@ -68,21 +63,28 @@ class IndexPageListener ?? $parsed['page']['priority'] ?? null; + $this->debug('Meta: priority candidate', ['priority' => $priority]); + if ($priority !== null && $priority !== '') { $set['priority'] = (int) $priority; } // KEYWORDS + $keywordSources = [ + $parsed['event']['keywords'] ?? null, + $parsed['news']['keywords'] ?? null, + $parsed['page']['keywords'] ?? null, + ]; + + $this->debug('Meta: keyword sources', ['sources' => $keywordSources]); + $keywords = []; - foreach ([ - $parsed['event']['keywords'] ?? null, - $parsed['news']['keywords'] ?? null, - $parsed['page']['keywords'] ?? null, - ] as $src) { - if (is_string($src) && trim($src) !== '') { - foreach (preg_split('/\s+/', trim($src)) as $word) { - $keywords[] = $word; - } + foreach ($keywordSources as $src) { + if (!is_string($src) || trim($src) === '') { + continue; + } + foreach (preg_split('/\s+/', trim($src)) as $word) { + $keywords[] = $word; } } @@ -90,9 +92,16 @@ class IndexPageListener $set['keywords'] = implode(' ', array_unique($keywords)); } - // IMAGEPATH - if (!empty($parsed['page']['searchimage'])) { - $set['imagepath'] = trim((string) $parsed['page']['searchimage']); + $this->debug('Meta: keywords result', [ + 'keywords' => $set['keywords'] ?? null, + ]); + + // IMAGEPATH (UUID) + $searchImage = $parsed['page']['searchimage'] ?? null; + $this->debug('Meta: searchimage candidate', ['searchimage' => $searchImage]); + + if (!empty($searchImage)) { + $set['imagepath'] = trim((string) $searchImage); } // STARTDATE @@ -101,18 +110,32 @@ class IndexPageListener ?? $parsed['news']['startDate'] ?? null; + $this->debug('Meta: startDate candidate', ['startDate' => $startDate]); + if (is_numeric($startDate) && (int) $startDate > 0) { $set['startDate'] = (int) $startDate; } // CHECKSUM - $checksumSeed = (string) ($data['checksum'] ?? ''); - $checksumSeed .= '|' . ($set['keywords'] ?? ''); - $checksumSeed .= '|' . ($set['priority'] ?? ''); - $checksumSeed .= '|' . ($set['imagepath'] ?? ''); - $checksumSeed .= '|' . ($set['startDate'] ?? ''); + try { + $checksumSeed = (string) ($data['checksum'] ?? ''); + $checksumSeed .= '|' . ($set['keywords'] ?? ''); + $checksumSeed .= '|' . ($set['priority'] ?? ''); + $checksumSeed .= '|' . ($set['imagepath'] ?? ''); + $checksumSeed .= '|' . ($set['startDate'] ?? ''); - $set['checksum'] = md5($checksumSeed); + $set['checksum'] = md5($checksumSeed); + + $this->debug('Checksum generated', [ + 'seed_preview' => substr($checksumSeed, 0, 120) . (strlen($checksumSeed) > 120 ? '…' : ''), + 'checksum' => $set['checksum'], + ]); + } catch (\Throwable $e) { + $this->debug('Failed to generate checksum', [ + 'error' => $e->getMessage(), + 'class' => $e::class, + ]); + } } } @@ -122,14 +145,24 @@ class IndexPageListener * ===================== */ if ((int) ($data['protected'] ?? 0) !== 0) { + $this->debug('Abort: protected page', ['protected' => $data['protected'] ?? null]); return; } - if (!Config::get('meilisearch_index_files')) { + $indexFiles = (bool) Config::get('meilisearch_index_files'); + + $this->debug('File indexing setting', [ + 'meilisearch_index_files' => $indexFiles, + ]); + + if (!$indexFiles) { + $this->debug('Abort: file indexing disabled'); return; } $links = $this->findAllLinks($content); + $this->debug('Links found', ['count' => count($links)]); + $fileLinks = []; foreach ($links as $link) { @@ -139,74 +172,66 @@ class IndexPageListener } } - if (!$fileLinks) { - return; - } + $this->debug('Indexable file links found', [ + 'count' => count($fileLinks), + 'types' => array_count_values(array_column($fileLinks, 'type')), + ]); - $db = $this->connection; - $time = time(); + if ($fileLinks) { + $db = System::getContainer()->get('database_connection'); + $time = time(); - foreach ($fileLinks as $file) { - $url = strtok($file['url'], '#'); + foreach ($fileLinks as $file) { + $url = strtok($file['url'], '#'); - /* - * ===================== - * URL-NORMALISIERUNG (NEU) - * ===================== - */ - if (str_contains($url, 'p=')) { - parse_str(parse_url($url, PHP_URL_QUERY) ?? '', $query); - if (!empty($query['p'])) { - $url = '/' . ltrim(rawurldecode($query['p']), '/'); - $this->debug('Normalized download URL', ['url' => $url]); - } - } + $path = parse_url($url, PHP_URL_PATH); + $abs = $path ? TL_ROOT . '/' . ltrim($path, '/') : null; - $path = parse_url($url, PHP_URL_PATH); - $abs = $path ? TL_ROOT . '/' . ltrim($path, '/') : null; + $mtime = ($abs && is_file($abs)) ? filemtime($abs) : 0; + $checksum = md5($url . '|' . $mtime); - $mtime = ($abs && is_file($abs)) ? filemtime($abs) : 0; - $checksum = md5($url . '|' . $mtime); - - try { $existing = $db->fetchAssociative( 'SELECT id, checksum FROM tl_search_files WHERE url = ?', [$url] ); - } catch (\Throwable $e) { - $this->debug('DB error', [ - 'url' => $url, - 'error' => $e->getMessage(), - ]); - continue; - } - if ($existing) { - $db->update( - 'tl_search_files', - [ - 'tstamp' => $time, - 'last_seen' => $time, - 'page_id' => (int) ($data['pid'] ?? 0), - 'file_mtime' => $mtime, - 'checksum' => $checksum, - ], - ['id' => $existing['id']] - ); - } else { - $db->insert( - 'tl_search_files', - [ - 'tstamp' => $time, - 'last_seen' => $time, - 'type' => $file['type'], - 'url' => $url, - 'title' => $file['linkText'] ?? basename($url), - 'page_id' => (int) ($data['pid'] ?? 0), - 'file_mtime' => $mtime, - 'checksum' => $checksum, - ] - ); + if ($existing) { + $db->update( + 'tl_search_files', + [ + 'tstamp' => $time, + 'last_seen' => $time, + 'page_id' => (int) ($data['pid'] ?? 0), + 'file_mtime' => $mtime, + 'checksum' => $checksum, + ], + ['id' => $existing['id']] + ); + + $this->debug('File updated', [ + 'url' => $url, + 'checksum' => $checksum, + ]); + } else { + $db->insert( + 'tl_search_files', + [ + 'tstamp' => $time, + 'last_seen' => $time, + 'type' => $file['type'], + 'url' => $url, + 'title' => $file['linkText'] ?? basename($url), + 'page_id' => (int) ($data['pid'] ?? 0), + 'file_mtime' => $mtime, + 'checksum' => $checksum, + ] + ); + + $this->debug('File inserted', [ + 'url' => $url, + 'checksum' => $checksum, + ]); + } } } @@ -256,7 +281,11 @@ class IndexPageListener private function detectIndexableFileType(string $url): ?string { $url = strtok($url, '#'); + $parts = parse_url($url); + if (!$parts) { + return null; + } if (!empty($parts['path'])) { $ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION)); @@ -267,10 +296,12 @@ class IndexPageListener if (!empty($parts['query'])) { parse_str($parts['query'], $query); + foreach (['file', 'p', 'f'] as $param) { if (!empty($query[$param])) { $candidate = rawurldecode(html_entity_decode((string) $query[$param], ENT_QUOTES)); $ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION)); + if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) { return $ext; }