diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php
index 75dec8c..e9ac5bc 100644
--- a/src/EventListener/IndexPageListener.php
+++ b/src/EventListener/IndexPageListener.php
@@ -41,21 +41,18 @@ class IndexPageListener
if (is_array($parsed)) {
- /*
- * PRIORITY
- */
+ // PRIORITY
$priority =
- $parsed['event']['priority'] ?? null ??
- $parsed['news']['priority'] ?? null ??
- $parsed['page']['priority'] ?? null;
+ $parsed['event']['priority']
+ ?? $parsed['news']['priority']
+ ?? $parsed['page']['priority']
+ ?? null;
if ($priority !== null && $priority !== '') {
$set['priority'] = (int) $priority;
}
- /*
- * KEYWORDS
- */
+ // KEYWORDS
$keywordSources = [
$parsed['event']['keywords'] ?? null,
$parsed['news']['keywords'] ?? null,
@@ -67,11 +64,8 @@ class IndexPageListener
if (!is_string($src) || trim($src) === '') {
continue;
}
-
foreach (preg_split('/\s+/', trim($src)) as $word) {
- if ($word !== '') {
- $keywords[] = $word;
- }
+ $keywords[] = $word;
}
}
@@ -79,33 +73,22 @@ class IndexPageListener
$set['keywords'] = implode(' ', array_unique($keywords));
}
- /*
- * IMAGEPATH (UUID)
- */
- if (
- isset($parsed['page']['searchimage'])
- && is_string($parsed['page']['searchimage'])
- && $parsed['page']['searchimage'] !== ''
- ) {
- $set['imagepath'] = trim($parsed['page']['searchimage']);
+ // IMAGEPATH
+ if (!empty($parsed['page']['searchimage'])) {
+ $set['imagepath'] = trim((string) $parsed['page']['searchimage']);
}
- /*
- * STARTDATE (Unix Timestamp)
- */
+ // STARTDATE
$startDate =
- $parsed['event']['startDate'] ?? null ??
- $parsed['news']['startDate'] ?? null;
+ $parsed['event']['startDate']
+ ?? $parsed['news']['startDate']
+ ?? null;
if (is_numeric($startDate) && (int) $startDate > 0) {
$set['startDate'] = (int) $startDate;
}
- /*
- * =====================
- * CHECKSUM-FIX
- * =====================
- */
+ // CHECKSUM
try {
$checksumSeed = (string) ($data['checksum'] ?? '');
$checksumSeed .= '|' . ($set['keywords'] ?? '');
@@ -122,40 +105,51 @@ class IndexPageListener
/*
* =====================
- * PDF-INDEXIERUNG
+ * DATEI-INDEXIERUNG (PDF / OFFICE)
* =====================
*/
- if (
- (bool) Config::get('meilisearch_index_pdfs')
- && (int) ($data['protected'] ?? 0) === 0
- ) {
- try {
- $pdfLinks = $this->findPdfLinks($content);
- if ($pdfLinks !== []) {
- $this->pdfIndexService->handlePdfLinks($pdfLinks);
- }
- } catch (\Throwable $e) {
- error_log('[ContaoMeilisearch] PDF indexing failed: ' . $e->getMessage());
+ if ((int) ($data['protected'] ?? 0) !== 0) {
+ return;
+ }
+
+ $indexPdfs = (bool) Config::get('meilisearch_index_pdfs');
+ $indexOffice = (bool) Config::get('meilisearch_index_office_pdfs');
+
+ if (!$indexPdfs && !$indexOffice) {
+ return;
+ }
+
+ $links = $this->findAllLinks($content);
+
+ $pdfLinks = [];
+ $officeLinks = [];
+
+ foreach ($links as $link) {
+ $type = $this->detectIndexableFileType($link['url']);
+
+ if ($type === 'pdf' && $indexPdfs) {
+ $pdfLinks[] = $link;
+ continue;
+ }
+
+ if (
+ in_array($type, ['docx', 'xlsx', 'pptx'], true)
+ && $indexOffice
+ ) {
+ $officeLinks[] = $link;
}
}
- /*
- * =====================
- * OFFICE-INDEXIERUNG
- * =====================
- */
- if (
- (bool) Config::get('meilisearch_index_office')
- && (int) ($data['protected'] ?? 0) === 0
- ) {
- try {
- $officeLinks = $this->findOfficeLinks($content);
- if ($officeLinks !== []) {
- $this->officeIndexService->handleOfficeLinks($officeLinks);
- }
- } catch (\Throwable $e) {
- error_log('[ContaoMeilisearch] Office indexing failed: ' . $e->getMessage());
+ try {
+ if ($pdfLinks !== []) {
+ $this->pdfIndexService->handlePdfLinks($pdfLinks);
}
+
+ if ($officeLinks !== []) {
+ $this->officeIndexService->handleOfficeLinks($officeLinks);
+ }
+ } catch (\Throwable $e) {
+ error_log('[ContaoMeilisearch] File indexing failed: ' . $e->getMessage());
}
}
@@ -171,61 +165,73 @@ class IndexPageListener
$json = preg_replace('/^\xEF\xBB\xBF/', '', trim($m[1]));
$data = json_decode($json, true);
- if (json_last_error() !== JSON_ERROR_NONE) {
- error_log('[ContaoMeilisearch] Invalid MEILISEARCH_JSON: ' . json_last_error_msg());
+ return json_last_error() === JSON_ERROR_NONE && is_array($data)
+ ? $data
+ : null;
+ }
+
+    /**
+     * Collects every anchor element (<a href="…">…</a>) found in the given markup.
+     *
+     * No filtering by file type happens here; every anchor with a non-empty,
+     * quoted href is returned.
+     *
+     * @param string $content HTML markup to scan
+     *
+     * @return list<array{url: string, linkText: string|null}> one entry per anchor, in document order
+     */
+    private function findAllLinks(string $content): array
+    {
+        // The pattern has to start at the opening "<a" tag. Without that prefix it would
+        // also match href attributes of other elements (e.g. <link href="…">) and swallow
+        // everything up to the next "</a>" as link text.
+        $found = preg_match_all(
+            '/<a\b[^>]*href=["\']([^"\']+)["\'][^>]*>(.*?)<\/a>/is',
+            $content,
+            $matches,
+            PREG_SET_ORDER
+        );
+
+        // 0 (no match) and false (PCRE error) both mean there is nothing to return.
+        if (!$found) {
+            return [];
+        }
+
+        $result = [];
+
+        foreach ($matches as [, $href, $innerHtml]) {
+            $linkText = trim(strip_tags($innerHtml));
+
+            $result[] = [
+                // href values are HTML-escaped inside the markup (&amp; etc.).
+                'url' => html_entity_decode($href),
+                // Compare against '' explicitly so that a link text of "0" is kept.
+                'linkText' => $linkText !== '' ? $linkText : null,
+            ];
+        }
+
+        return $result;
+    }
+
+    /**
+     * Determines whether a link target points to an indexable file.
+     *
+     * The file extension is looked up first in the URL path and then in the
+     * "file", "p" and "f" query parameters.
+     *
+     * @param string $url link target (href value)
+     *
+     * @return string|null lower-case extension ("pdf", "docx", "xlsx" or "pptx"), or null if none matches
+     */
+    private function detectIndexableFileType(string $url): ?string
+    {
+        // Drop the fragment. strtok() is unsuitable for this: it skips leading delimiters
+        // (so "#notes.pdf" would become "notes.pdf") and returns false for "#" or "".
+        $url = explode('#', $url, 2)[0];
+
+        $parts = parse_url($url);
+        if ($parts === false) {
             return null;
         }
-        return is_array($data) ? $data : null;
-    }
-
-    /**
-     * Findet PDF-Links im Content
-     */
-    private function findPdfLinks(string $content): array
-    {
-        if (!preg_match_all(
-            '/]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is',
-            $content,
-            $matches
-        )) {
-            return [];
+        // Direct path (/files/…)
+        if (!empty($parts['path'])) {
+            $ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
+            if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
+                return $ext;
+            }
         }
-        $result = [];
+        // Query parameters (Contao 4 + 5)
+        if (!empty($parts['query'])) {
+            parse_str($parts['query'], $query);
-        foreach ($matches[1] as $i => $href) {
-            $result[] = [
-                'url' => html_entity_decode($href),
-                'linkText' => trim(strip_tags($matches[2][$i])) ?: null,
-            ];
+            foreach (['file', 'p', 'f'] as $param) {
+                // parse_str() yields arrays for "p[]=…" style parameters; only strings are usable.
+                if (!empty($query[$param]) && is_string($query[$param])) {
+                    // parse_str() has already decoded the value once; the extra urldecode()
+                    // also covers double-encoded values — NOTE(review): confirm this is intended.
+                    $candidate = urldecode($query[$param]);
+                    $ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION));
+
+                    if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
+                        return $ext;
+                    }
+                }
+            }
         }
-        return $result;
-    }
-
-    /**
-     * Findet Office-Links (docx, xlsx, pptx)
-     */
-    private function findOfficeLinks(string $content): array
-    {
-        if (!preg_match_all(
-            '/]*href=["\']([^"\']*(?:\.(?:docx|xlsx|pptx)|p=(?:docx|xlsx|pptx)(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is',
-            $content,
-            $matches
-        )) {
-            return [];
-        }
-
-        $result = [];
-
-        foreach ($matches[1] as $i => $href) {
-            $result[] = [
-                'url' => html_entity_decode($href),
-                'linkText' => trim(strip_tags($matches[2][$i])) ?: null,
-            ];
-        }
-
-        return $result;
+        return null;
     }
}
\ No newline at end of file