From b59bda40fc04418178d0f4dbb3b49cb721052188 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrgen=20Mummert?= Date: Mon, 9 Mar 2026 10:06:05 +0100 Subject: [PATCH] Rename bundle namespace and prepare release 0.6.0 --- .gitignore | 6 + README.md | 73 +++++ composer.json | 21 ++ icon.svg | 22 ++ .../MeilisearchFilesCleanupCommand.php | 100 ++++++ src/Command/MeilisearchFilesParseCommand.php | 277 +++++++++++++++++ src/Command/MeilisearchIndexCommand.php | 57 ++++ src/ContaoManager/Plugin.php | 29 ++ src/ContaoMeilisearchBundle.php | 9 + .../MeilisearchSearchController.php | 26 ++ .../ContaoMeilisearchExtension.php | 17 + src/EventListener/IndexPageListener.php | 223 +++++++++++++ .../MeilisearchPageMarkerListener.php | 293 ++++++++++++++++++ src/Resources/config/services.yaml | 26 ++ src/Resources/contao/config/config.php | 10 + .../contao/dca/tl_calendar_events.php | 36 +++ src/Resources/contao/dca/tl_module.php | 19 ++ src/Resources/contao/dca/tl_news.php | 32 ++ src/Resources/contao/dca/tl_page.php | 46 +++ src/Resources/contao/dca/tl_search.php | 36 +++ src/Resources/contao/dca/tl_search_files.php | 95 ++++++ src/Resources/contao/dca/tl_settings.php | 136 ++++++++ src/Resources/contao/languages/de/default.php | 7 + src/Resources/contao/languages/de/modules.php | 7 + .../languages/de/tl_calendar_events.php | 11 + .../contao/languages/de/tl_module.php | 7 + src/Resources/contao/languages/de/tl_news.php | 11 + src/Resources/contao/languages/de/tl_page.php | 14 + .../contao/languages/de/tl_settings.php | 37 +++ .../mod_meilisearch_search.html.twig | 273 ++++++++++++++++ src/Resources/public/icons/filetype-docx.svg | 3 + src/Resources/public/icons/filetype-pdf.svg | 3 + src/Resources/public/icons/filetype-pptx.svg | 3 + src/Resources/public/icons/filetype-xlsx.svg | 3 + src/Service/MeilisearchFileHelper.php | 259 ++++++++++++++++ src/Service/MeilisearchImageHelper.php | 84 +++++ src/Service/MeilisearchIndexService.php | 271 ++++++++++++++++ 37 files changed, 2582 insertions(+) create mode 100755 .gitignore create mode 100755 README.md create mode 100755 composer.json create mode 100755 icon.svg create mode 100755 src/Command/MeilisearchFilesCleanupCommand.php create mode 100755 src/Command/MeilisearchFilesParseCommand.php create mode 100755 src/Command/MeilisearchIndexCommand.php create mode 100755 src/ContaoManager/Plugin.php create mode 100755 src/ContaoMeilisearchBundle.php create mode 100755 src/Controller/FrontendModule/MeilisearchSearchController.php create mode 100755 src/DependencyInjection/ContaoMeilisearchExtension.php create mode 100755 src/EventListener/IndexPageListener.php create mode 100755 src/EventListener/MeilisearchPageMarkerListener.php create mode 100755 src/Resources/config/services.yaml create mode 100755 src/Resources/contao/config/config.php create mode 100755 src/Resources/contao/dca/tl_calendar_events.php create mode 100755 src/Resources/contao/dca/tl_module.php create mode 100755 src/Resources/contao/dca/tl_news.php create mode 100755 src/Resources/contao/dca/tl_page.php create mode 100755 src/Resources/contao/dca/tl_search.php create mode 100755 src/Resources/contao/dca/tl_search_files.php create mode 100755 src/Resources/contao/dca/tl_settings.php create mode 100755 src/Resources/contao/languages/de/default.php create mode 100755 src/Resources/contao/languages/de/modules.php create mode 100755 src/Resources/contao/languages/de/tl_calendar_events.php create mode 100755 src/Resources/contao/languages/de/tl_module.php create mode 100755 src/Resources/contao/languages/de/tl_news.php create mode 100755 src/Resources/contao/languages/de/tl_page.php create mode 100755 src/Resources/contao/languages/de/tl_settings.php create mode 100755 src/Resources/contao/templates/frontend_module/mod_meilisearch_search.html.twig create mode 100755 src/Resources/public/icons/filetype-docx.svg create mode 100755 src/Resources/public/icons/filetype-pdf.svg create mode 100755 src/Resources/public/icons/filetype-pptx.svg create mode 100755 src/Resources/public/icons/filetype-xlsx.svg create mode 100755 src/Service/MeilisearchFileHelper.php create mode 100755 src/Service/MeilisearchImageHelper.php create mode 100755 src/Service/MeilisearchIndexService.php diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..b174abf --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.idea/ +vendor/ +var/cache/ +var/log/ +.vscode/ +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md new file mode 100755 index 0000000..1db3daf --- /dev/null +++ b/README.md @@ -0,0 +1,73 @@ +# Contao Meilisearch Bundle + +Eine schlanke Schnittstelle zwischen **Contao CMS (4.13 / 5.6 / 5.7 ready) unter PHP 8.4** und einer **selbst gehosteten Meilisearch-Instanz**. +Das Bundle erweitert den Contao-Suchindex um strukturierte Daten und ermöglicht eine performante, moderne Volltextsuche. +Das Parsen von Dateien erfolgt über eine Apache-Tika-Instanz, welche extern bereitgestellt werden muss. + +--- + +## ✨ Features + +- Integration von **Meilisearch** als externe Suchmaschine +- Indexierung von: + - Contao-Seiten + - Inhaltselementen + - **PDF-Dateien** + - **Office-Dokumenten** (DOCX, XLSX, PPTX) +- Unterstützung für: + - Seiten-Prioritäten + - Keywords + - Vorschaubild +- Kompatibel mit: + - Contao **4.13**, **5.6** und **5.7** + - PHP **8.4** + +--- + +## ⏱️ Scheduled Indexing (Cron setup) + +Das Bundle stellt eigene Commands zur Verfügung, um Dateien zu bereinigen und den Meilisearch-Index neu aufzubauen. +Für den produktiven Einsatz wird empfohlen, diese Commands regelmäßig per **System-Crontab** auszuführen. + +Das Bundle nutzt **keinen eigenen Contao-Cron**, sondern System-Cronjobs. + +## Verfügbare Commands + +### Datei-Cleanup + +``` +/vendor/bin/contao-console meilisearch:files:cleanup +``` + +### Datei-Parsing + +``` +/vendor/bin/contao-console meilisearch:files:parse +``` + +### Meilisearch-Index + +``` +/vendor/bin/contao-console meilisearch:index +``` + + + +## Beispiel Crontab + +``` +0 5 * * * /usr/bin/php8.4 /path/to/project/vendor/bin/contao-console meilisearch:files:cleanup +1 5 * * * /usr/bin/php8.4 /path/to/project/vendor/bin/contao-console contao:crawl +10 5 * * * /usr/bin/php8.4 /path/to/project/vendor/bin/contao-console meilisearch:files:parse +20 5 * * * /usr/bin/php8.4 /path/to/project/vendor/bin/contao-console meilisearch:index +``` + +## Logging + +``` +>> var/logs/meilisearch_cron.log 2>&1 +``` + +## Lizenz + +MIT \ No newline at end of file diff --git a/composer.json b/composer.json new file mode 100755 index 0000000..2fb9c02 --- /dev/null +++ b/composer.json @@ -0,0 +1,21 @@ +{ + "name": "mummert/contao-meilisearch-bundle", + "description": "Contao Meilisearch integration bundle", + "type": "contao-bundle", + "license": "MIT", + "require": { + "php": "^8.3", + "contao/core-bundle": "^4.13 || ^5.6 || ^5.7", + "contao/calendar-bundle": "^4.13 || ^5.6 || ^5.7", + "contao/news-bundle": "^4.13 || ^5.6 || ^5.7", + "meilisearch/meilisearch-php": "^1.16" + }, + "autoload": { + "psr-4": { + "Mummert\\ContaoMeilisearchBundle\\": "src/" + } + }, + "extra": { + "contao-manager-plugin": "Mummert\\ContaoMeilisearchBundle\\ContaoManager\\Plugin" + } +} \ No newline at end of file diff --git a/icon.svg b/icon.svg new file mode 100755 index 0000000..0ac5a90 --- /dev/null +++ b/icon.svg @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/Command/MeilisearchFilesCleanupCommand.php b/src/Command/MeilisearchFilesCleanupCommand.php new file mode 100755 index 0000000..6a22261 --- /dev/null +++ b/src/Command/MeilisearchFilesCleanupCommand.php @@ -0,0 +1,100 @@ +setName('meilisearch:files:cleanup') + ->setDescription('Remove stale indexed files from tl_search_files') + ->addOption( + 'grace', + null, + InputOption::VALUE_OPTIONAL, + 'Grace period in seconds (files newer than now-grace are kept)', + 86400 + ) + ->addOption( + 'dry-run', + null, + InputOption::VALUE_NONE, + 'Show how many entries would be removed without deleting them' + ); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $this->framework->initialize(); + + $this->log('Cleaner gestartet'); + + try { + $grace = max(0, (int) $input->getOption('grace')); + $dryRun = (bool) $input->getOption('dry-run'); + $cutoff = time() - $grace; + + if ($dryRun) { + $count = $this->connection->fetchOne( + 'SELECT COUNT(*) FROM tl_search_files WHERE last_seen < ?', + [$cutoff] + ); + + $message = sprintf( + '[DRY-RUN] %d stale file(s) would be removed (last_seen < %s)', + $count, + date('Y-m-d H:i:s', $cutoff) + ); + + $output->writeln('' . $message . ''); + $this->log($message); + + $this->log('Cleaner stopped (dry-run)'); + return Command::SUCCESS; + } + + $affected = $this->connection->executeStatement( + 'DELETE FROM tl_search_files WHERE last_seen < ?', + [$cutoff] + ); + + $message = sprintf( + 'Removed %d stale file(s) (last_seen < %s)', + $affected, + date('Y-m-d H:i:s', $cutoff) + ); + + $output->writeln('' . $message . ''); + $this->log($message); + + $this->log('Cleaner successfully stopped'); + return Command::SUCCESS; + + } catch (\Throwable $e) { + $this->log('Cleaner ERROR: ' . $e->getMessage()); + $output->writeln('' . $e->getMessage() . ''); + + return Command::FAILURE; + } + } + + private function log(string $message): void + { + error_log(sprintf('[%s] %s', date('Y-m-d H:i:s'), $message)); + } +} \ No newline at end of file diff --git a/src/Command/MeilisearchFilesParseCommand.php b/src/Command/MeilisearchFilesParseCommand.php new file mode 100755 index 0000000..9bcc4d9 --- /dev/null +++ b/src/Command/MeilisearchFilesParseCommand.php @@ -0,0 +1,277 @@ +setName('meilisearch:files:parse') + ->setDescription('Parse indexed files via Apache Tika and enrich tl_search_files') + ->addOption( + 'limit', + null, + InputOption::VALUE_OPTIONAL, + 'Maximum number of files to check per run' + ) + ->addOption( + 'dry-run', + null, + InputOption::VALUE_NONE, + 'Do not send files to Tika' + ); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $this->framework->initialize(); + $this->log('Parser gestartet'); + + $dryRun = (bool) $input->getOption('dry-run'); + + $limitOption = $input->getOption('limit'); + $limit = $limitOption !== null ? max(1, (int) $limitOption) : null; + + $tikaUrl = rtrim((string) ($GLOBALS['TL_CONFIG']['meilisearch_tika_url'] ?? ''), '/'); + if ($tikaUrl === '') { + $output->writeln('Tika URL not configured'); + return Command::FAILURE; + } + + $db = Database::getInstance(); + + $sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC"; + if ($limit !== null) { + $sql .= " LIMIT " . (int) $limit; + } + + $files = $db->query($sql)->fetchAllAssoc(); + + if (!$files) { + $this->log('No files to parse'); + return Command::SUCCESS; + } + + $client = HttpClient::create([ + 'timeout' => 180, + ]); + + foreach ($files as $file) { + + $originalUrl = (string) $file['url']; + $existingTitle = trim((string) ($file['title'] ?? '')); + $normalized = $originalUrl; + + // ------------------------------------------------- + // Normalize URL + // ------------------------------------------------- + if (str_contains($normalized, '?')) { + $parts = parse_url($normalized); + if (!empty($parts['query'])) { + parse_str($parts['query'], $query); + if (!empty($query['file'])) { + $normalized = (string) $query['file']; + } else { + $this->log('Not a direct file url, skip', ['url' => $originalUrl]); + continue; + } + } + } + + $normalized = strtok($normalized, '#'); + $normalized = rawurldecode($normalized); + $normalized = ltrim($normalized, '/'); + + if (!str_starts_with($normalized, 'files/')) { + $this->log('Not in files/, skip', ['url' => $originalUrl]); + continue; + } + + $root = defined('TL_ROOT') + ? TL_ROOT + : System::getContainer()->getParameter('kernel.project_dir') . '/public'; + + $absolutePath = $root . '/' . $normalized; + + if (!is_file($absolutePath)) { + $this->log('File missing, skip', [ + 'url' => $originalUrl, + 'path' => $absolutePath, + ]); + continue; + } + + $mtime = filemtime($absolutePath) ?: 0; + $checksum = md5($normalized . '|' . $mtime); + + // ------------------------------------------------- + // Skip unchanged + // ------------------------------------------------- + if ($file['checksum'] === $checksum && !empty($file['text'])) { + continue; + } + + if ($dryRun) { + $output->writeln('[DRY-RUN] Would parse: ' . $normalized); + continue; + } + + // ------------------------------------------------- + // MIME-Type + // ------------------------------------------------- + $ext = strtolower(pathinfo($normalized, PATHINFO_EXTENSION)); + + $mimeType = match ($ext) { + 'pdf' => 'application/pdf', + 'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + default => null, + }; + + if ($mimeType === null) { + $this->log('Unsupported file type, skip', ['url' => $normalized]); + continue; + } + + // ------------------------------------------------- + // Tika BODY (roher Plaintext) + // ------------------------------------------------- + try { + $this->log('Parsing file', ['url' => $normalized]); + + $bodyResponse = $client->request( + 'PUT', + $tikaUrl . '/tika/main', + [ + 'headers' => [ + 'Accept' => 'text/plain', + 'Content-Type' => $mimeType, + ], + 'body' => fopen($absolutePath, 'rb'), + ] + ); + + $text = trim((string) $bodyResponse->getContent(false)); + + } catch (\Throwable $e) { + $this->log('Body parse failed', [ + 'url' => $normalized, + 'error' => $e->getMessage(), + ]); + continue; + } + + // ------------------------------------------------- + // TITLE: keep existing editor-defined title + // ------------------------------------------------- + $title = $existingTitle !== '' ? $existingTitle : null; + + // ------------------------------------------------- + // Tika METADATA (Title) – only if no existing title + // ------------------------------------------------- + if ($title === null) { + try { + $metaResponse = $client->request( + 'PUT', + $tikaUrl . '/meta', + [ + 'headers' => [ + 'Accept' => 'application/json', + 'Content-Type' => $mimeType, + ], + 'body' => fopen($absolutePath, 'rb'), + ] + ); + + $meta = json_decode($metaResponse->getContent(false), true); + + $rawTitle = + $meta['dc:title'][0] + ?? $meta['pdf:docinfo:title'][0] + ?? null; + + if ($rawTitle) { + $title = html_entity_decode( + $rawTitle, + ENT_QUOTES | ENT_HTML5, + 'UTF-8' + ); + } + + } catch (\Throwable) { + // Metadata optional + } + } + + // ------------------------------------------------- + // TITLE → ASCII SAFE (only if newly generated) + // ------------------------------------------------- + if ($existingTitle === '' && $title) { + $title = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $title); + $title = preg_replace('/\s+/', ' ', $title); + $title = trim($title); + } + + // ------------------------------------------------- + // FALLBACK: Dateiname (only if still empty) + // ------------------------------------------------- + if (!$title || strlen($title) < 5) { + $title = pathinfo($normalized, PATHINFO_FILENAME); + $title = str_replace(['_', '-'], ' ', $title); + $title = preg_replace('/\s+/', ' ', $title); + $title = trim($title); + } + + // ------------------------------------------------- + // Store result + // ------------------------------------------------- + $db->prepare( + "UPDATE tl_search_files + SET text = ?, title = ?, checksum = ?, file_mtime = ?, tstamp = ? + WHERE id = ?" + )->execute( + $text, + $title, + $checksum, + $mtime, + time(), + $file['id'] + ); + + $this->log('File parsed', [ + 'url' => $normalized, + 'chars' => mb_strlen($text), + 'title' => $title, + ]); + } + + $this->log('Parser finished'); + return Command::SUCCESS; + } + + private function log(string $message, array $context = []): void + { + $ctx = $context + ? ' | ' . json_encode($context, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE) + : ''; + + error_log('[MeilisearchFilesParse] ' . $message . $ctx); + } +} \ No newline at end of file diff --git a/src/Command/MeilisearchIndexCommand.php b/src/Command/MeilisearchIndexCommand.php new file mode 100755 index 0000000..f9d389b --- /dev/null +++ b/src/Command/MeilisearchIndexCommand.php @@ -0,0 +1,57 @@ +setName('meilisearch:index') + ->setDescription('Rebuild Meilisearch index from Contao search tables'); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $this->log('Meilisearch index gestartet'); + $output->writeln('Meilisearch index started'); + + try { + $this->indexService->run(); + + $this->log('Meilisearch index successfully stopped'); + $output->writeln('Meilisearch index finished'); + + return Command::SUCCESS; + + } catch (\Throwable $e) { + $this->log('Meilisearch index ERROR: ' . $e->getMessage()); + $output->writeln('' . $e->getMessage() . ''); + + return Command::FAILURE; + } + } + + /** + * Einheitliches Logging mit Zeitstempel + */ + private function log(string $message): void + { + error_log(sprintf( + '[%s] %s', + date('Y-m-d H:i:s'), + $message + )); + } +} \ No newline at end of file diff --git a/src/ContaoManager/Plugin.php b/src/ContaoManager/Plugin.php new file mode 100755 index 0000000..2ac00d2 --- /dev/null +++ b/src/ContaoManager/Plugin.php @@ -0,0 +1,29 @@ +setLoadAfter([ + ContaoCoreBundle::class, + ContaoCalendarBundle::class, + ContaoNewsBundle::class, + ]), + ]; + } +} \ No newline at end of file diff --git a/src/ContaoMeilisearchBundle.php b/src/ContaoMeilisearchBundle.php new file mode 100755 index 0000000..13a8eca --- /dev/null +++ b/src/ContaoMeilisearchBundle.php @@ -0,0 +1,9 @@ +meiliLimit = (int) ($model->meiliLimit ?: 50); + $template->meiliHost = Config::get('meilisearch_host'); + $template->meiliIndex = Config::get('meilisearch_index'); + $template->meiliSearchKey = Config::get('meilisearch_api_search'); + + return $template->getResponse(); + } +} \ No newline at end of file diff --git a/src/DependencyInjection/ContaoMeilisearchExtension.php b/src/DependencyInjection/ContaoMeilisearchExtension.php new file mode 100755 index 0000000..0a0da73 --- /dev/null +++ b/src/DependencyInjection/ContaoMeilisearchExtension.php @@ -0,0 +1,17 @@ +load('services.yaml'); + } +} \ No newline at end of file diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php new file mode 100755 index 0000000..33ee15a --- /dev/null +++ b/src/EventListener/IndexPageListener.php @@ -0,0 +1,223 @@ +debug('Hook start', [ + 'url' => $data['url'] ?? null, + 'protected' => $data['protected'] ?? null, + 'checksum' => $data['checksum'] ?? null, + 'set_keys' => array_keys($set), + ]); + + /* + * ===================== + * SEITEN-METADATEN + * ===================== + */ + $hasMeta = str_contains($content, 'MEILISEARCH_JSON'); + $this->debug('Meta marker scan', [ + 'contains_MEILISEARCH_JSON' => $hasMeta, + 'content_length' => strlen($content), + ]); + + if ($hasMeta) { + try { + $parsed = $this->extractMeilisearchJson($content); + $this->debug('extractMeilisearchJson(): done', [ + 'parsed_is_array' => is_array($parsed), + 'parsed_keys' => is_array($parsed) ? array_keys($parsed) : null, + ]); + } catch (\Throwable $e) { + $this->debug('Failed to extract MEILISEARCH_JSON', [ + 'error' => $e->getMessage(), + 'class' => $e::class, + ]); + $parsed = null; + } + + if (is_array($parsed)) { + + // PRIORITY + $priority = + $parsed['event']['priority'] + ?? $parsed['news']['priority'] + ?? $parsed['page']['priority'] + ?? null; + + $this->debug('Meta: priority candidate', ['priority' => $priority]); + + if ($priority !== null && $priority !== '') { + $set['priority'] = (int) $priority; + } + + // KEYWORDS + $keywordSources = [ + $parsed['event']['keywords'] ?? null, + $parsed['news']['keywords'] ?? null, + $parsed['page']['keywords'] ?? null, + ]; + + $keywords = []; + foreach ($keywordSources as $src) { + if (!is_string($src) || trim($src) === '') { + continue; + } + foreach (preg_split('/\s+/', trim($src)) as $word) { + $keywords[] = $word; + } + } + + if ($keywords) { + $set['keywords'] = implode(' ', array_unique($keywords)); + } + + // IMAGEPATH + if (!empty($parsed['page']['searchimage'] ?? null)) { + $set['imagepath'] = trim((string) $parsed['page']['searchimage']); + } + + // STARTDATE + if (is_numeric($parsed['event']['startDate'] ?? null)) { + $set['startDate'] = (int) $parsed['event']['startDate']; + } + + // CHECKSUM + $checksumSeed = (string) ($data['checksum'] ?? ''); + $checksumSeed .= '|' . ($set['keywords'] ?? ''); + $checksumSeed .= '|' . ($set['priority'] ?? ''); + $checksumSeed .= '|' . ($set['imagepath'] ?? ''); + $checksumSeed .= '|' . ($set['startDate'] ?? ''); + + $set['checksum'] = md5($checksumSeed); + } + } + + /* + * ===================== + * DATEI-ERKENNUNG (NUR ERKENNUNG!) + * ===================== + */ + if ((int) ($data['protected'] ?? 0) !== 0) { + return; + } + + if (!Config::get('meilisearch_index_files')) { + return; + } + + $links = $this->findAllLinks($content); + $fileLinks = []; + + foreach ($links as $link) { + $type = $this->detectIndexableFileType($link['url']); + if ($type !== null) { + $fileLinks[] = $link + ['type' => $type]; + } + } + + $this->debug('Indexable file links found', [ + 'count' => count($fileLinks), + ]); + + if ($fileLinks) { + foreach ($fileLinks as $file) { + $this->fileHelper->collect( + $file['url'], + $file['type'], + (int) ($data['pid'] ?? 0) + ); + } + } + + $this->debug('Hook end', [ + 'final_set_keys' => array_keys($set), + ]); + } + + /* === Hilfsmethoden unverändert === */ + + private function extractMeilisearchJson(string $content): ?array + { + if (!preg_match('//s', $content, $m)) { + return null; + } + + $json = preg_replace('/^\xEF\xBB\xBF/', '', trim($m[1])); + $data = json_decode($json, true); + + return json_last_error() === JSON_ERROR_NONE && is_array($data) + ? $data + : null; + } + + private function findAllLinks(string $content): array + { + if (!preg_match_all( + '/]*href=["\']([^"\']+)["\'][^>]*>(.*?)<\/a>/is', + $content, + $matches + )) { + return []; + } + + $result = []; + + foreach ($matches[1] as $i => $href) { + $result[] = [ + 'url' => html_entity_decode($href), + 'linkText' => trim(strip_tags($matches[2][$i])) ?: null, + ]; + } + + return $result; + } + + private function detectIndexableFileType(string $url): ?string + { + $url = strtok($url, '#'); + $parts = parse_url($url); + + if (!empty($parts['path'])) { + $ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION)); + if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) { + return $ext; + } + } + + if (!empty($parts['query'])) { + parse_str($parts['query'], $query); + + foreach (['file', 'p', 'f'] as $param) { + if (!empty($query[$param])) { + $candidate = rawurldecode(html_entity_decode((string) $query[$param], ENT_QUOTES)); + $ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION)); + + if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) { + return $ext; + } + } + } + } + + return null; + } +} \ No newline at end of file diff --git a/src/EventListener/MeilisearchPageMarkerListener.php b/src/EventListener/MeilisearchPageMarkerListener.php new file mode 100755 index 0000000..92a4a71 --- /dev/null +++ b/src/EventListener/MeilisearchPageMarkerListener.php @@ -0,0 +1,293 @@ +]*>(.*?)#si', $buffer, $m)) { + $mainHtml = $m[1]; + + if (preg_match( + '#meilisearch-uuid=["\']([a-f0-9-]{36})["\']#i', + $mainHtml, + $mm + )) { + $contentImageUuid = $mm[1]; + } + } + + /* + * ===================== + * KEYWORDS AUS FRONTEND (Catalog Manager) + * ===================== + */ + $frontendKeywords = []; + + if (preg_match( + '#]+id=["\']keywords["\'][^>]+meilisearch-keywords=["\']([^"\']+)["\']#i', + $buffer, + $m + )) { + $frontendKeywords = preg_split('/\s+/', trim($m[1])); + } + + /* + * ===================== + * PAGE (Basisdaten) + * ===================== + */ + $pageImageUuid = null; + + if (isset($GLOBALS['objPage']) && $GLOBALS['objPage'] instanceof PageModel) { + $page = $GLOBALS['objPage']; + + $data['page'] = []; + + if (!empty($page->priority)) { + $data['page']['priority'] = (int) $page->priority; + } + + if (!empty($page->keywords)) { + $data['page']['keywords'] = trim((string) $page->keywords); + } + + if (!empty($page->searchimage)) { + $raw = (string) $page->searchimage; + + if (preg_match('/^[a-f0-9-]{36}$/i', $raw)) { + $pageImageUuid = $raw; + } else { + try { + $pageImageUuid = StringUtil::binToUuid($raw); + } catch (\Throwable) {} + } + } + } + + /* + * ===================== + * JSON-LD AUSWERTEN + * ===================== + */ + preg_match_all( + '##s', + $buffer, + $matches + ); + + foreach ($matches[1] as $jsonRaw) { + $json = json_decode($jsonRaw, true); + if (!is_array($json)) { + continue; + } + + $graph = $json['@graph'] ?? []; + if (!is_array($graph)) { + continue; + } + + foreach ($graph as $entry) { + + /* + * EVENT + */ + if (($entry['@type'] ?? null) === 'Event' && !empty($entry['@id'])) { + if (preg_match('#/schema/events/(\d+)#', $entry['@id'], $m)) { + $event = CalendarEventsModel::findByPk((int) $m[1]); + + if ($event !== null) { + $data['event'] = []; + + if (!empty($event->priority)) { + $data['event']['priority'] = (int) $event->priority; + } + + if (!empty($event->keywords)) { + $data['event']['keywords'] = trim((string) $event->keywords); + } + + if ($event->addImage && !empty($event->singleSRC)) { + $data['event']['searchimage'] = StringUtil::binToUuid($event->singleSRC); + } + + if (!empty($event->startDate)) { + $data['event']['startDate'] = (int) $event->startDate; + } + } + } + } + + /* + * NEWS + */ + if (($entry['@type'] ?? null) === 'NewsArticle' && !empty($entry['@id'])) { + if (preg_match('#/schema/news/(\d+)#', $entry['@id'], $m)) { + $news = NewsModel::findByPk((int) $m[1]); + + if ($news !== null) { + $data['news'] = []; + + if (!empty($news->priority)) { + $data['news']['priority'] = (int) $news->priority; + } + + if (!empty($news->keywords)) { + $data['news']['keywords'] = trim((string) $news->keywords); + } + + if ($news->addImage && !empty($news->singleSRC)) { + $data['news']['searchimage'] = StringUtil::binToUuid($news->singleSRC); + } + } + } + } + } + } + + /* + * ===================== + * KEYWORDS ZUSAMMENFÜHREN + * ===================== + */ + $allKeywords = []; + + if (!empty($data['page']['keywords'])) { + $allKeywords = array_merge( + $allKeywords, + preg_split('/\s+/', $data['page']['keywords']) + ); + } + + if (!empty($data['event']['keywords'])) { + $allKeywords = array_merge( + $allKeywords, + preg_split('/\s+/', $data['event']['keywords']) + ); + } + + if (!empty($data['news']['keywords'])) { + $allKeywords = array_merge( + $allKeywords, + preg_split('/\s+/', $data['news']['keywords']) + ); + } + + if (!empty($frontendKeywords)) { + $allKeywords = array_merge($allKeywords, $frontendKeywords); + } + + $allKeywords = array_unique( + array_filter( + array_map('trim', $allKeywords) + ) + ); + + if ($allKeywords !== []) { + $data['page']['keywords'] = implode(' ', $allKeywords); + } + + /* + * ===================== + * FINALE SEARCHIMAGE-ENTSCHEIDUNG + * ===================== + */ + $finalSearchImageUuid = null; + + if ($contentImageUuid !== null) { + $finalSearchImageUuid = $contentImageUuid; + } + elseif (!empty($data['event']['searchimage'])) { + $finalSearchImageUuid = $data['event']['searchimage']; + } + elseif (!empty($data['news']['searchimage'])) { + $finalSearchImageUuid = $data['news']['searchimage']; + } + elseif ($pageImageUuid) { + $finalSearchImageUuid = $pageImageUuid; + } + else { + $fallback = Config::get('meilisearch_fallback_image'); + if ($fallback) { + $finalSearchImageUuid = $fallback; + } + } + + if ($finalSearchImageUuid !== null) { + $data['page'] ??= []; + $data['page']['searchimage'] = $finalSearchImageUuid; + } + + if ($data === []) { + return $buffer; + } + + /* + * ===================== + * META-SPAN + * ===================== + */ + $metaParts = []; + + if (!empty($data['page']['priority'])) { + $metaParts[] = 'page_priority=' . $data['page']['priority']; + } + if (!empty($data['page']['keywords'])) { + $metaParts[] = 'page_keywords=' . $data['page']['keywords']; + } + if (!empty($data['page']['searchimage'])) { + $metaParts[] = 'page_searchimage=' . $data['page']['searchimage']; + } + if ($contentImageUuid) { + $metaParts[] = 'content_searchimage=' . $contentImageUuid; + } + if (!empty($data['event']['startDate'])) { + $metaParts[] = 'event_startDate=' . $data['event']['startDate']; + } + + $hiddenMeta = + "\n" . + "⟦MEILISEARCH_META⟧ " . + htmlspecialchars(implode(' | ', $metaParts), ENT_QUOTES) . + " ⟦/MEILISEARCH_META⟧" . + "\n"; + + $marker = + "\n\n"; + + $injection = $hiddenMeta . $marker; + + return str_contains($buffer, '') + ? str_replace('', $injection . '', $buffer) + : $buffer . $injection; + } +} \ No newline at end of file diff --git a/src/Resources/config/services.yaml b/src/Resources/config/services.yaml new file mode 100755 index 0000000..62fba82 --- /dev/null +++ b/src/Resources/config/services.yaml @@ -0,0 +1,26 @@ +services: + # Alias MUSS vorhanden sein (richtig platziert) + Psr\Container\ContainerInterface: '@service_container' + + Mummert\ContaoMeilisearchBundle\: + resource: '../../{Command,EventListener,Service}' + autowire: true + autoconfigure: true + + Mummert\ContaoMeilisearchBundle\EventListener\MeilisearchPageMarkerListener: + autowire: true + autoconfigure: false + tags: + - { name: contao.hook, hook: outputFrontendTemplate, method: onOutputFrontendTemplate } + + Mummert\ContaoMeilisearchBundle\EventListener\IndexPageListener: + autowire: true + autoconfigure: false + tags: + - { name: contao.hook, hook: indexPage, method: onIndexPage } + + Mummert\ContaoMeilisearchBundle\Controller\FrontendModule\MeilisearchSearchController: + autowire: true + autoconfigure: false + tags: + - { name: contao.frontend_module, type: meilisearch_search, category: search } \ No newline at end of file diff --git a/src/Resources/contao/config/config.php b/src/Resources/contao/config/config.php new file mode 100755 index 0000000..a9091a4 --- /dev/null +++ b/src/Resources/contao/config/config.php @@ -0,0 +1,10 @@ +addLegend('meilisearch_legend', 'pal_expert_legend', PaletteManipulator::POSITION_AFTER) + ->addField('priority', 'meilisearch_legend') + ->addField('keywords', 'meilisearch_legend') + ->applyToPalette('default', 'tl_calendar_events'); + +/** + * Priority + */ +$dca['fields']['priority'] = [ + 'inputType' => 'select', + 'options' => [1, 2, 3], + 'reference' => &$GLOBALS['TL_LANG']['MSC']['meilisearch_priority'], + 'default' => 2, + 'eval' => ['tl_class' => 'w50'], + 'sql' => "int(1) NOT NULL default '2'" +]; + +/** + * Keywords + */ +$dca['fields']['keywords'] = [ + 'inputType' => 'text', + 'eval' => ['tl_class' => 'w50', 'maxlength' => 255], + 'sql' => "varchar(255) NOT NULL default ''" +]; \ No newline at end of file diff --git a/src/Resources/contao/dca/tl_module.php b/src/Resources/contao/dca/tl_module.php new file mode 100755 index 0000000..986ca1b --- /dev/null +++ b/src/Resources/contao/dca/tl_module.php @@ -0,0 +1,19 @@ + &$GLOBALS['TL_LANG']['tl_module']['meiliLimit'], + 'inputType' => 'text', + 'default' => 50, + 'eval' => [ + 'rgxp' => 'digit', + 'mandatory' => true, + 'tl_class' => 'w50', + ], + 'sql' => "int(10) unsigned NOT NULL default 50", +]; \ No newline at end of file diff --git a/src/Resources/contao/dca/tl_news.php b/src/Resources/contao/dca/tl_news.php new file mode 100755 index 0000000..fd5f7f5 --- /dev/null +++ b/src/Resources/contao/dca/tl_news.php @@ -0,0 +1,32 @@ +addLegend('meilisearch_legend', 'pal_expert_legend', PaletteManipulator::POSITION_AFTER) + ->addField('priority', 'meilisearch_legend') + ->addField('keywords', 'meilisearch_legend') + ->applyToPalette('default', 'tl_news'); + +/** + * Priority + */ +$dca['fields']['priority'] = [ + 'inputType' => 'select', + 'options' => [1, 2, 3], + 'reference' => &$GLOBALS['TL_LANG']['MSC']['meilisearch_priority'], + 'default' => 2, + 'eval' => ['tl_class' => 'w50'], + 'sql' => "int(1) NOT NULL default '2'" +]; + +/** + * Keywords + */ +$dca['fields']['keywords'] = [ + 'inputType' => 'text', + 'eval' => ['tl_class' => 'w50', 'maxlength' => 255], + 'sql' => "varchar(255) NOT NULL default ''" +]; \ No newline at end of file diff --git a/src/Resources/contao/dca/tl_page.php b/src/Resources/contao/dca/tl_page.php new file mode 100755 index 0000000..e76b79c --- /dev/null +++ b/src/Resources/contao/dca/tl_page.php @@ -0,0 +1,46 @@ +addLegend('meilisearch_legend', 'pal_expert_legend', PaletteManipulator::POSITION_AFTER) + ->addField('priority', 'meilisearch_legend') + ->addField('keywords', 'meilisearch_legend') + ->addField('searchimage', 'meilisearch_legend') + ->applyToPalette('regular', 'tl_page'); + +/** + * Priority + */ +$dca['fields']['priority'] = [ + 'inputType' => 'select', + 'options' => [1, 2, 3], + 'reference' => &$GLOBALS['TL_LANG']['MSC']['meilisearch_priority'], + 'default' => 2, + 'eval' => ['tl_class' => 'w50'], + 'sql' => "int(1) NOT NULL default '2'" +]; + +/** + * Keywords + */ +$dca['fields']['keywords'] = [ + 'inputType' => 'text', + 'eval' => ['tl_class' => 'w50', 'maxlength' => 255], + 'sql' => "varchar(255) NOT NULL default ''" +]; + +/** + * Search image + */ +$dca['fields']['searchimage'] = [ + 'inputType' => 'fileTree', + 'eval' => [ + 'tl_class' => 'w50', + 'filesOnly' => true, + 'fieldType' => 'radio' + ], + 'sql' => "varbinary(16) NULL" +]; \ No newline at end of file diff --git a/src/Resources/contao/dca/tl_search.php b/src/Resources/contao/dca/tl_search.php new file mode 100755 index 0000000..0456b66 --- /dev/null +++ b/src/Resources/contao/dca/tl_search.php @@ -0,0 +1,36 @@ + ['Keywords', 'Suchbegriffe für die Indexierung'], + 'exclude' => true, + 'inputType' => 'text', + 'eval' => ['tl_class' => 'w50', 'maxlength' => 255], + 'sql' => "varchar(255) NOT NULL default ''", +]; + +$GLOBALS['TL_DCA']['tl_search']['fields']['priority'] = [ + 'label' => ['Priorität', 'Priorität für die Suchergebnisse'], + 'exclude' => true, + 'inputType' => 'select', + 'options' => [1, 2, 3], + 'eval' => ['tl_class' => 'w50'], + 'sql' => "int(1) NOT NULL default '2'", +]; + +$GLOBALS['TL_DCA']['tl_search']['fields']['imagepath'] = [ + 'label' => ['Suchbild', 'UUID des Suchbildes'], + 'exclude' => true, + 'inputType' => 'text', + 'eval' => ['maxlength' => 512], + 'sql' => "varchar(512) NOT NULL default ''", +]; + +$GLOBALS['TL_DCA']['tl_search']['fields']['startDate'] = [ + 'label' => ['Startdatum', 'Startdatum für die Suchergebnisse (Unix-Timestamp)'], + 'exclude' => true, + 'inputType' => 'text', + 'eval' => ['tl_class' => 'w50', 'rgxp' => 'digit'], + 'sql' => "bigint(20) NOT NULL default '0'", +]; diff --git a/src/Resources/contao/dca/tl_search_files.php b/src/Resources/contao/dca/tl_search_files.php new file mode 100755 index 0000000..b8c7a9c --- /dev/null +++ b/src/Resources/contao/dca/tl_search_files.php @@ -0,0 +1,95 @@ + [ + 'dataContainer' => DC_Table::class, + 'sql' => [ + 'keys' => [ + 'id' => 'primary', + 'page_id' => 'index', + 'url' => 'unique', + 'type' => 'index', + 'checksum' => 'index', + 'uuid' => 'index', + 'last_seen' => 'index', + ], + ], + ], + + 'fields' => [ + 'id' => [ + 'sql' => "int(10) unsigned NOT NULL auto_increment", + ], + + 'tstamp' => [ + 'sql' => "int(10) unsigned NOT NULL default 0", + ], + + /* + * Zeitpunkt, wann die Datei zuletzt beim Crawl gesehen wurde + * → Basis für Cleanup + */ + 'last_seen' => [ // ⬅️ NEU + 'sql' => "int(10) unsigned NOT NULL default 0", + ], + + /* + * Dateityp: pdf | docx | xlsx | pptx + */ + 'type' => [ + 'sql' => "varchar(16) NOT NULL default 'pdf'", + ], + + /* + * Absolute oder normalisierte Datei-URL + * z. B. /files/pdf/foo.pdf + */ + 'url' => [ + 'sql' => "varchar(1024) NOT NULL default ''", + ], + + /* + * Linktext oder Dateiname + */ + 'title' => [ + 'sql' => "varchar(255) NOT NULL default ''", + ], + + /* + * Geparster Datei-Text (PDF / Office) + */ + 'text' => [ + 'sql' => "mediumtext NULL", + ], + + 'uuid' => [ + 'sql' => "binary(16) NULL", + ], + + /* + * md5(url + filemtime) + * → erkennt Änderungen zuverlässig + */ + 'checksum' => [ + 'sql' => "char(32) NOT NULL default ''", + ], + + /* + * Herkunftsseite (tl_page.id) + * → optional, Debug / Referenz + */ + 'page_id' => [ + 'sql' => "int(10) unsigned NOT NULL default 0", + ], + + /* + * Dateizeitstempel + * → wichtig für Re-Indexierung + */ + 'file_mtime' => [ + 'sql' => "int(10) unsigned NOT NULL default 0", + ], + ], +]; \ No newline at end of file diff --git a/src/Resources/contao/dca/tl_settings.php b/src/Resources/contao/dca/tl_settings.php new file mode 100755 index 0000000..d284ef3 --- /dev/null +++ b/src/Resources/contao/dca/tl_settings.php @@ -0,0 +1,136 @@ + 'text', + 'eval' => [ + 'mandatory' => true, + 'rgxp' => 'url', + 'tl_class' => 'w50', + ], +]; + +$GLOBALS['TL_DCA']['tl_settings']['fields']['meilisearch_index'] = [ + 'inputType' => 'text', + 'eval' => [ + 'mandatory' => true, + 'tl_class' => 'w50', + ], +]; + +$GLOBALS['TL_DCA']['tl_settings']['fields']['meilisearch_api_write'] = [ + 'inputType' => 'text', + 'eval' => [ + 'mandatory' => true, + 'tl_class' => 'w50', + 'hideInput' => true, + ], +]; + +$GLOBALS['TL_DCA']['tl_settings']['fields']['meilisearch_api_search'] = [ + 'inputType' => 'text', + 'eval' => [ + 'mandatory' => true, + 'tl_class' => 'w50', + 'hideInput' => true, + ], +]; + +$GLOBALS['TL_DCA']['tl_settings']['fields']['meilisearch_imagesize'] = [ + 'inputType' => 'select', + 'options_callback' => static function () { + $db = System::getContainer()->get('database_connection'); + $rows = $db->fetchAllAssociative('SELECT id, name FROM tl_image_size ORDER BY name'); + + $options = ['' => '-']; + foreach ($rows as $row) { + $options[$row['id']] = $row['name'] . ' (ID ' . $row['id'] . ')'; + } + + return $options; + }, + 'eval' => [ + 'tl_class' => 'w50', + 'chosen' => true, + 'includeBlankOption' => true, + ], + 'sql' => "int(10) unsigned NOT NULL default 0", +]; + +$GLOBALS['TL_DCA']['tl_settings']['fields']['meilisearch_fallback_image'] = [ + 'inputType' => 'fileTree', + 'eval' => [ + 'filesOnly' => true, + 'fieldType' => 'radio', + 'tl_class' => 'w50', + ], + 'sql' => "varbinary(16) NULL", +]; + +$GLOBALS['TL_DCA']['tl_settings']['fields']['meilisearch_index_past_events'] = [ + 'inputType' => 'checkbox', + 'eval' => [ + 'tl_class' => 'w50 clr', + ], +]; + +/** + * ------------------------------------------------- + * Datei-Indexierung (Tika) + * ------------------------------------------------- + */ + +$GLOBALS['TL_DCA']['tl_settings']['fields']['meilisearch_index_files'] = [ + 'inputType' => 'checkbox', + 'eval' => [ + 'tl_class' => 'w50', + 'submitOnChange' => true, + ], + 'sql' => "char(1) NOT NULL default '0'", +]; + +$GLOBALS['TL_DCA']['tl_settings']['fields']['meilisearch_tika_url'] = [ + 'inputType' => 'text', + 'eval' => [ + 'rgxp' => 'url', + 'mandatory' => true, + 'tl_class' => 'w50 clr', + ], +]; + +/** + * ------------------------------------------------- + * Selector / Subpalette + * ------------------------------------------------- + */ + +$GLOBALS['TL_DCA']['tl_settings']['palettes']['__selector__'][] = 'meilisearch_index_files'; + +$GLOBALS['TL_DCA']['tl_settings']['subpalettes']['meilisearch_index_files'] + = 'meilisearch_tika_url'; + +/** + * ------------------------------------------------- + * Palette + * ------------------------------------------------- + */ + +PaletteManipulator::create() + ->addLegend('meilisearch_legend', null, PaletteManipulator::POSITION_AFTER, true) + ->addField('meilisearch_host', 'meilisearch_legend') + ->addField('meilisearch_index', 'meilisearch_legend') + ->addField('meilisearch_api_write', 'meilisearch_legend') + ->addField('meilisearch_api_search', 'meilisearch_legend') + ->addField('meilisearch_imagesize', 'meilisearch_legend') + ->addField('meilisearch_fallback_image', 'meilisearch_legend') + ->addField('meilisearch_index_past_events', 'meilisearch_legend') + ->addField('meilisearch_index_files', 'meilisearch_legend') + ->applyToPalette('default', 'tl_settings'); \ No newline at end of file diff --git a/src/Resources/contao/languages/de/default.php b/src/Resources/contao/languages/de/default.php new file mode 100755 index 0000000..11f1945 --- /dev/null +++ b/src/Resources/contao/languages/de/default.php @@ -0,0 +1,7 @@ + 'Niedrig', + 2 => 'Standard', + 3 => 'Hoch', +]; \ No newline at end of file diff --git a/src/Resources/contao/languages/de/modules.php b/src/Resources/contao/languages/de/modules.php new file mode 100755 index 0000000..5767e2a --- /dev/null +++ b/src/Resources/contao/languages/de/modules.php @@ -0,0 +1,7 @@ + +{% block meilisearch %} + + + +{% endblock %} + \ No newline at end of file diff --git a/src/Resources/public/icons/filetype-docx.svg b/src/Resources/public/icons/filetype-docx.svg new file mode 100755 index 0000000..1b6c172 --- /dev/null +++ b/src/Resources/public/icons/filetype-docx.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/src/Resources/public/icons/filetype-pdf.svg b/src/Resources/public/icons/filetype-pdf.svg new file mode 100755 index 0000000..e8bb772 --- /dev/null +++ b/src/Resources/public/icons/filetype-pdf.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/src/Resources/public/icons/filetype-pptx.svg b/src/Resources/public/icons/filetype-pptx.svg new file mode 100755 index 0000000..f68e939 --- /dev/null +++ b/src/Resources/public/icons/filetype-pptx.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/src/Resources/public/icons/filetype-xlsx.svg b/src/Resources/public/icons/filetype-xlsx.svg new file mode 100755 index 0000000..5202bf7 --- /dev/null +++ b/src/Resources/public/icons/filetype-xlsx.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/src/Service/MeilisearchFileHelper.php b/src/Service/MeilisearchFileHelper.php new file mode 100755 index 0000000..8a328c8 --- /dev/null +++ b/src/Service/MeilisearchFileHelper.php @@ -0,0 +1,259 @@ +log('collect() start', [ + 'url' => $url, + 'type' => $type, + 'pageId' => $pageId, + ]); + + // ------------------------------------------------- + // 1. URL normalisieren + // ------------------------------------------------- + $cleanUrl = strtok($url, '#'); + $parts = parse_url($cleanUrl); + + if (!$parts) { + $this->log('Invalid URL, skip'); + return; + } + + // ------------------------------------------------- + // 2. Externe Datei? → skip + // ------------------------------------------------- + if (!empty($parts['host'])) { + $currentRequest = System::getContainer() + ->get('request_stack') + ->getCurrentRequest(); + + $pageHost = $currentRequest + ? parse_url($currentRequest->getSchemeAndHttpHost(), PHP_URL_HOST) + : null; + + if ($pageHost && $parts['host'] !== $pageHost) { + $this->log('External file detected, skip', [ + 'host' => $parts['host'], + ]); + return; + } + } + + // ------------------------------------------------- + // 3. Pfad-Kandidaten sammeln (ohne Annahmen!) + // ------------------------------------------------- + $query = []; + if (!empty($parts['query'])) { + parse_str($parts['query'], $query); + } + + $pathCandidates = []; + + // direkter Pfad + if (!empty($parts['path'])) { + $pathCandidates[] = $parts['path']; + } + + // Download-Parameter + foreach (['file', 'f', 'p'] as $param) { + if (!empty($query[$param])) { + $pathCandidates[] = $query[$param]; + } + } + + // normalisieren + $pathCandidates = array_values(array_unique(array_filter(array_map( + static function ($candidate) { + $candidate = rawurldecode(html_entity_decode((string) $candidate, ENT_QUOTES)); + return ltrim($candidate, '/') ?: null; + }, + $pathCandidates + )))); + + $this->log('Path candidates (normalized)', [ + 'candidates' => $pathCandidates, + ]); + + // ------------------------------------------------- + // 4. FilesModel (DBAFS) auflösen → UUID + // ------------------------------------------------- + $fileModel = null; + + foreach ($pathCandidates as $candidate) { + + // 1) direkt + $model = FilesModel::findByPath($candidate); + if ($model && $model->uuid) { + $fileModel = $model; + $this->log('Resolved via FilesModel (direct)', [ + 'candidate' => $candidate, + 'path' => $model->path, + ]); + break; + } + + // 2) fallback: files/ davor + if (!str_starts_with($candidate, 'files/')) { + $model = FilesModel::findByPath('files/' . $candidate); + if ($model && $model->uuid) { + $fileModel = $model; + $this->log('Resolved via FilesModel (files/ prefix)', [ + 'candidate' => $candidate, + 'path' => $model->path, + ]); + break; + } + } + } + + if (!$fileModel) { + $this->log('No Contao file model found, skip', [ + 'candidates' => $pathCandidates, + ]); + return; + } + + $normalizedPath = (string) $fileModel->path; + $uuidBin = $fileModel->uuid; + $uuid = StringUtil::binToUuid($uuidBin); + $canonicalUrl = '/' . ltrim($normalizedPath, '/'); + + $this->log('UUID resolved', [ + 'path' => $canonicalUrl, + 'uuid' => $uuid, + ]); + + // ------------------------------------------------- + // 5. Datei im Filesystem prüfen + // ------------------------------------------------- + $projectDir = System::getContainer()->getParameter('kernel.project_dir'); + $abs = $projectDir . '/public/' . $normalizedPath; + + if (!is_file($abs)) { + $this->log('Resolved model but file missing on filesystem, skip', [ + 'path' => $normalizedPath, + 'abs' => $abs, + ]); + return; + } + +// ------------------------------------------------- +// 6. Redaktionellen Titel aus tl_files.meta +// ------------------------------------------------- + $title = null; + $meta = StringUtil::deserialize($fileModel->meta, true); + +// 1) bevorzugte Sprache (falls vorhanden) + $lang = $GLOBALS['TL_LANGUAGE'] ?? null; + if ($lang && !empty($meta[$lang]['title'])) { + $title = trim((string) $meta[$lang]['title']); + } + +// 2) Fallback: erste verfügbare Sprache + if ($title === null && is_array($meta)) { + foreach ($meta as $langKey => $langMeta) { + if (!empty($langMeta['title'])) { + $title = trim((string) $langMeta['title']); + break; + } + } + } + + if ($title) { + $this->log('Title resolved from tl_files', [ + 'title' => $title, + ]); + } + + // ------------------------------------------------- + // 7. Datei-Infos + // ------------------------------------------------- + $mtime = filemtime($abs) ?: 0; + $checksum = md5($normalizedPath . '|' . $mtime); + $now = time(); + + // ------------------------------------------------- + // 8. Upsert über UUID + // ------------------------------------------------- + $existing = $this->connection->fetchAssociative( + 'SELECT id FROM tl_search_files WHERE uuid = ?', + [$uuidBin] + ); + + if ($existing) { + $data = [ + 'tstamp' => $now, + 'last_seen' => $now, + 'type' => $type, + 'url' => $canonicalUrl, + 'page_id' => $pageId, + 'file_mtime' => $mtime, + 'checksum' => $checksum, + ]; + + if ($title !== null) { + $data['title'] = $title; + } + + $this->connection->update( + 'tl_search_files', + $data, + ['id' => $existing['id']] + ); + + $this->log('File updated by UUID', [ + 'uuid' => $uuid, + ]); + } else { + $this->connection->insert( + 'tl_search_files', + [ + 'tstamp' => $now, + 'last_seen' => $now, + 'type' => $type, + 'url' => $canonicalUrl, + 'title' => $title ?? basename($normalizedPath), + 'page_id' => $pageId, + 'file_mtime' => $mtime, + 'checksum' => $checksum, + 'uuid' => $uuidBin, + ] + ); + + $this->log('File inserted by UUID', [ + 'uuid' => $uuid, + ]); + } + + $this->log('collect() end'); + } + + // ------------------------------------------------- + // Logging + // ------------------------------------------------- + private function log(string $message, array $context = []): void + { + $ctx = $context + ? ' | ' . json_encode($context, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE) + : ''; + + error_log('[ContaoMeilisearch][MeilisearchFileHelper] ' . $message . $ctx); + } +} \ No newline at end of file diff --git a/src/Service/MeilisearchImageHelper.php b/src/Service/MeilisearchImageHelper.php new file mode 100755 index 0000000..c42226f --- /dev/null +++ b/src/Service/MeilisearchImageHelper.php @@ -0,0 +1,84 @@ +framework->initialize(); + } catch (\Throwable $e) { + error_log('[ContaoMeilisearch] ImageHelper: Framework init failed: ' . $e->getMessage()); + return null; + } + + /** @var FilesModel|null $file */ + try { + $file = FilesModel::findByUuid($uuid); + } catch (\Throwable $e) { + error_log( + '[ContaoMeilisearch] ImageHelper: FilesModel lookup failed (' . $uuid . '): ' . $e->getMessage() + ); + return null; + } + + if (!$file) { + error_log('[ContaoMeilisearch] ImageHelper: File not found for UUID ' . $uuid); + return null; + } + + // ImageSize aus tl_settings + $imageSizeId = (int) Config::get('meilisearch_imagesize'); + + // Fallback: Originaldatei + if ($imageSizeId <= 0) { + return $file->path; + } + + try { + $figure = $this->studio + ->createFigureBuilder() + ->from($file->path) + ->setSize($imageSizeId) + ->build(); + + $image = $figure->getImage(); + + if ($image === null) { + error_log( + '[ContaoMeilisearch] ImageHelper: Image generation failed for ' . $file->path + ); + return null; + } + + return $image->getImageSrc() ?: null; + + } catch (\Throwable $e) { + error_log( + '[ContaoMeilisearch] ImageHelper: Image processing failed for ' + . $file->path . ': ' . $e->getMessage() + ); + return null; + } + } +} \ No newline at end of file diff --git a/src/Service/MeilisearchIndexService.php b/src/Service/MeilisearchIndexService.php new file mode 100755 index 0000000..0125241 --- /dev/null +++ b/src/Service/MeilisearchIndexService.php @@ -0,0 +1,271 @@ + '/bundles/contaomeilisearch/icons/filetype-pdf.svg', + 'docx' => '/bundles/contaomeilisearch/icons/filetype-docx.svg', + 'xlsx' => '/bundles/contaomeilisearch/icons/filetype-xlsx.svg', + 'pptx' => '/bundles/contaomeilisearch/icons/filetype-pptx.svg', + ]; + + public function __construct( + private readonly Connection $connection, + private readonly ContaoFramework $framework, + private readonly MeilisearchImageHelper $imageHelper, + ) {} + + /** + * Entry point for command & cron + */ + public function run(): void + { + try { + $this->framework->initialize(); + } catch (\Throwable $e) { + error_log('[ContaoMeilisearch] Framework initialization failed: ' . $e->getMessage()); + return; + } + + $host = (string) Config::get('meilisearch_host'); + $apiKey = (string) Config::get('meilisearch_api_write'); + $this->indexName = (string) Config::get('meilisearch_index'); + + if ($host === '' || $this->indexName === '') { + error_log('[ContaoMeilisearch] Meilisearch is not configured in tl_settings.'); + return; + } + + try { + $this->client = new Client($host, $apiKey); + $index = $this->client->index($this->indexName); + } catch (\Throwable $e) { + error_log('[ContaoMeilisearch] Failed to connect to Meilisearch: ' . $e->getMessage()); + return; + } + + try { + $this->ensureIndexSettings($index); + } catch (\Throwable $e) { + error_log('[ContaoMeilisearch] Failed to update index settings: ' . $e->getMessage()); + } + + try { + $index->deleteAllDocuments(); + } catch (\Throwable $e) { + error_log('[ContaoMeilisearch] Failed to delete documents: ' . $e->getMessage()); + return; + } + + $this->indexTlSearch($index); + $this->indexTlSearchFiles($index); + } + + private function ensureIndexSettings(Indexes $index): void + { + $index->updateSettings([ + 'searchableAttributes' => ['title', 'keywords', 'text'], + 'sortableAttributes' => ['priority', 'startDate'], + 'filterableAttributes' => ['type', 'filetype'], + ]); + } + + /** + * ⛔ MEILISEARCH_META aus Text entfernen + */ + private function stripMeilisearchMeta(string $text): string + { + $text = preg_replace( + '/⟦MEILISEARCH_META⟧.*?⟦\/MEILISEARCH_META⟧/su', + '', + $text + ); + + $text = preg_replace('/\s{2,}/u', ' ', $text); + $text = preg_replace('/\n{2,}/u', "\n", $text); + + return trim($text); + } + + /** + * startDate aus schema.org Event extrahieren + */ + private function extractEventStartDate(?string $meta): ?int + { + if (!$meta) { + return null; + } + + $data = json_decode($meta, true); + if (!is_array($data)) { + return null; + } + + foreach ($data as $entry) { + if (($entry['@type'] ?? null) !== 'https://schema.org/Event') { + continue; + } + + if (!empty($entry['https://schema.org/startDate'])) { + return strtotime($entry['https://schema.org/startDate']) ?: null; + } + + if (!empty($entry['startDate'])) { + return strtotime($entry['startDate']) ?: null; + } + } + + return null; + } + + /** + * tl_search indexieren (Seiten / News / Events) + */ + private function indexTlSearch(Indexes $index): void + { + try { + $rows = $this->connection->fetchAllAssociative('SELECT * FROM tl_search'); + } catch (\Throwable $e) { + error_log('[ContaoMeilisearch] Failed to read tl_search: ' . $e->getMessage()); + return; + } + + if (!$rows) { + return; + } + + $indexPastEvents = (bool) Config::get('meilisearch_index_past_events'); + $today = strtotime('today'); + + $documents = []; + + foreach ($rows as $row) { + try { + $type = $this->detectTypeFromMeta($row['meta'] ?? null); + + $eventStart = null; + if ($type === 'event') { + $eventStart = $this->extractEventStartDate($row['meta'] ?? null); + if (!$indexPastEvents && $eventStart !== null && $eventStart < $today) { + continue; + } + } + + $doc = [ + 'id' => $type . '_' . $row['id'], + 'type' => $type, + 'title' => $row['title'], + 'text' => $this->stripMeilisearchMeta((string) $row['text']), + 'url' => $row['url'], + 'protected' => (bool) $row['protected'], + 'checksum' => $row['checksum'], + 'keywords' => (string) ($row['keywords'] ?? ''), + 'priority' => (int) ($row['priority'] ?? 0), + ]; + + if ($eventStart !== null) { + $doc['startDate'] = $eventStart; + } + + if (!empty($row['imagepath'])) { + $imagePath = $this->imageHelper->resolveImagePath($row['imagepath']); + if ($imagePath !== null) { + $doc['poster'] = $imagePath; + } + } + + $documents[] = $doc; + + } catch (\Throwable $e) { + error_log('[ContaoMeilisearch] Failed to build tl_search document: ' . $e->getMessage()); + } + } + + if ($documents !== []) { + $index->addDocuments($documents); + } + } + + /** + * tl_search_files indexieren (PDF / Office) + */ + private function indexTlSearchFiles(Indexes $index): void + { + try { + $rows = $this->connection->fetchAllAssociative('SELECT * FROM tl_search_files'); + } catch (\Throwable $e) { + error_log('[ContaoMeilisearch] Failed to read tl_search_files: ' . $e->getMessage()); + return; + } + + if (!$rows) { + return; + } + + $documents = []; + + foreach ($rows as $row) { + try { + $fileType = in_array($row['type'], ['pdf', 'docx', 'xlsx', 'pptx'], true) + ? $row['type'] + : 'pdf'; + + $documents[] = [ + 'id' => 'file_' . $row['id'], + 'type' => 'file', + 'filetype' => $fileType, + 'title' => $row['title'] ?: basename($row['url']), + 'text' => (string) $row['text'], + 'url' => $row['url'], + 'checksum' => $row['checksum'], + 'poster' => self::FILETYPE_ICON_MAP[$fileType] + ?? self::FILETYPE_ICON_MAP['pdf'], + ]; + + } catch (\Throwable $e) { + error_log('[ContaoMeilisearch] Failed to build file document: ' . $e->getMessage()); + } + } + + if ($documents !== []) { + $index->addDocuments($documents); + } + } + + private function detectTypeFromMeta(?string $meta): string + { + if (!$meta) { + return 'page'; + } + + $data = json_decode($meta, true); + if (!is_array($data)) { + return 'page'; + } + + foreach ($data as $entry) { + if (($entry['@type'] ?? null) === 'https://schema.org/Event') { + return 'event'; + } + if (($entry['@type'] ?? null) === 'https://schema.org/NewsArticle') { + return 'news'; + } + } + + return 'page'; + } +} \ No newline at end of file