This commit is contained in:
Jürgen Mummert
2025-12-28 11:12:10 +01:00
parent 0cf9703a9b
commit 31def907f6
+114 -108
View File
@@ -41,21 +41,18 @@ class IndexPageListener
if (is_array($parsed)) { if (is_array($parsed)) {
/* // PRIORITY
* PRIORITY
*/
$priority = $priority =
$parsed['event']['priority'] ?? null ?? $parsed['event']['priority']
$parsed['news']['priority'] ?? null ?? ?? $parsed['news']['priority']
$parsed['page']['priority'] ?? null; ?? $parsed['page']['priority']
?? null;
if ($priority !== null && $priority !== '') { if ($priority !== null && $priority !== '') {
$set['priority'] = (int) $priority; $set['priority'] = (int) $priority;
} }
/* // KEYWORDS
* KEYWORDS
*/
$keywordSources = [ $keywordSources = [
$parsed['event']['keywords'] ?? null, $parsed['event']['keywords'] ?? null,
$parsed['news']['keywords'] ?? null, $parsed['news']['keywords'] ?? null,
@@ -67,11 +64,8 @@ class IndexPageListener
if (!is_string($src) || trim($src) === '') { if (!is_string($src) || trim($src) === '') {
continue; continue;
} }
foreach (preg_split('/\s+/', trim($src)) as $word) { foreach (preg_split('/\s+/', trim($src)) as $word) {
if ($word !== '') { $keywords[] = $word;
$keywords[] = $word;
}
} }
} }
@@ -79,33 +73,22 @@ class IndexPageListener
$set['keywords'] = implode(' ', array_unique($keywords)); $set['keywords'] = implode(' ', array_unique($keywords));
} }
/* // IMAGEPATH
* IMAGEPATH (UUID) if (!empty($parsed['page']['searchimage'])) {
*/ $set['imagepath'] = trim((string) $parsed['page']['searchimage']);
if (
isset($parsed['page']['searchimage'])
&& is_string($parsed['page']['searchimage'])
&& $parsed['page']['searchimage'] !== ''
) {
$set['imagepath'] = trim($parsed['page']['searchimage']);
} }
/* // STARTDATE
* STARTDATE (Unix Timestamp)
*/
$startDate = $startDate =
$parsed['event']['startDate'] ?? null ?? $parsed['event']['startDate']
$parsed['news']['startDate'] ?? null; ?? $parsed['news']['startDate']
?? null;
if (is_numeric($startDate) && (int) $startDate > 0) { if (is_numeric($startDate) && (int) $startDate > 0) {
$set['startDate'] = (int) $startDate; $set['startDate'] = (int) $startDate;
} }
/* // CHECKSUM
* =====================
* CHECKSUM-FIX
* =====================
*/
try { try {
$checksumSeed = (string) ($data['checksum'] ?? ''); $checksumSeed = (string) ($data['checksum'] ?? '');
$checksumSeed .= '|' . ($set['keywords'] ?? ''); $checksumSeed .= '|' . ($set['keywords'] ?? '');
@@ -122,40 +105,51 @@ class IndexPageListener
/* /*
* ===================== * =====================
* PDF-INDEXIERUNG * DATEI-INDEXIERUNG (PDF / OFFICE)
* ===================== * =====================
*/ */
if ( if ((int) ($data['protected'] ?? 0) !== 0) {
(bool) Config::get('meilisearch_index_pdfs') return;
&& (int) ($data['protected'] ?? 0) === 0 }
) {
try { $indexPdfs = (bool) Config::get('meilisearch_index_pdfs');
$pdfLinks = $this->findPdfLinks($content); $indexOffice = (bool) Config::get('meilisearch_index_office_pdfs');
if ($pdfLinks !== []) {
$this->pdfIndexService->handlePdfLinks($pdfLinks); if (!$indexPdfs && !$indexOffice) {
} return;
} catch (\Throwable $e) { }
error_log('[ContaoMeilisearch] PDF indexing failed: ' . $e->getMessage());
$links = $this->findAllLinks($content);
$pdfLinks = [];
$officeLinks = [];
foreach ($links as $link) {
$type = $this->detectIndexableFileType($link['url']);
if ($type === 'pdf' && $indexPdfs) {
$pdfLinks[] = $link;
continue;
}
if (
in_array($type, ['docx', 'xlsx', 'pptx'], true)
&& $indexOffice
) {
$officeLinks[] = $link;
} }
} }
/* try {
* ===================== if ($pdfLinks !== []) {
* OFFICE-INDEXIERUNG $this->pdfIndexService->handlePdfLinks($pdfLinks);
* =====================
*/
if (
(bool) Config::get('meilisearch_index_office')
&& (int) ($data['protected'] ?? 0) === 0
) {
try {
$officeLinks = $this->findOfficeLinks($content);
if ($officeLinks !== []) {
$this->officeIndexService->handleOfficeLinks($officeLinks);
}
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Office indexing failed: ' . $e->getMessage());
} }
if ($officeLinks !== []) {
$this->officeIndexService->handleOfficeLinks($officeLinks);
}
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] File indexing failed: ' . $e->getMessage());
} }
} }
@@ -171,61 +165,73 @@ class IndexPageListener
$json = preg_replace('/^\xEF\xBB\xBF/', '', trim($m[1])); $json = preg_replace('/^\xEF\xBB\xBF/', '', trim($m[1]));
$data = json_decode($json, true); $data = json_decode($json, true);
if (json_last_error() !== JSON_ERROR_NONE) { return json_last_error() === JSON_ERROR_NONE && is_array($data)
error_log('[ContaoMeilisearch] Invalid MEILISEARCH_JSON: ' . json_last_error_msg()); ? $data
: null;
}
/**
 * Collects every <a href="…"> link found in the given markup.
 *
 * Each hit is returned as ['url' => string, 'linkText' => ?string]: the URL is
 * the entity-decoded href value, the link text is the anchor's inner content
 * with tags stripped and surrounding whitespace trimmed (null when nothing
 * remains, e.g. for image-only links).
 *
 * @param string $content HTML markup to scan
 *
 * @return array<int, array{url: string, linkText: ?string}>
 */
private function findAllLinks(string $content): array
{
    // href must be a standalone attribute (first in the tag or preceded by
    // whitespace) so that "data-href" and similar attributes are not mistaken
    // for it. The value ends at the same quote character that opened it, so a
    // double-quoted URL may contain an apostrophe and vice versa.
    $pattern = '/<a\s+(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|\'([^\']*)\')[^>]*>(.*?)<\/a>/is';

    // preg_match_all() yields 0 (no hits) or false (PCRE error); both mean "no links".
    if (!preg_match_all($pattern, $content, $matches, PREG_SET_ORDER)) {
        return [];
    }

    $result = [];

    foreach ($matches as $match) {
        // Exactly one of the two quote alternatives captured the value.
        $href = $match[1] !== '' ? $match[1] : $match[2];

        if ($href === '') {
            continue;
        }

        $linkText = trim(strip_tags($match[3]));

        $result[] = [
            // Explicit flags: the default flag set differs between PHP versions.
            'url' => html_entity_decode($href, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8'),
            // Strict comparison so that a literal "0" is kept as link text.
            'linkText' => $linkText !== '' ? $linkText : null,
        ];
    }

    return $result;
}
/**
* Ermittelt indexierbaren Dateityp (pdf|docx|xlsx|pptx) oder null
*/
private function detectIndexableFileType(string $url): ?string
{
// Hash entfernen
$url = strtok($url, '#');
$parts = parse_url($url);
if (!$parts) {
return null; return null;
} }
return is_array($data) ? $data : null; // direkter Pfad (/files/…)
} if (!empty($parts['path'])) {
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
/** if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
* Findet PDF-Links im Content return $ext;
*/ }
private function findPdfLinks(string $content): array
{
if (!preg_match_all(
'/<a\s+[^>]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is',
$content,
$matches
)) {
return [];
} }
$result = []; // Query-Parameter (Contao 4 + 5)
if (!empty($parts['query'])) {
parse_str($parts['query'], $query);
foreach ($matches[1] as $i => $href) { foreach (['file', 'p', 'f'] as $param) {
$result[] = [ if (!empty($query[$param])) {
'url' => html_entity_decode($href), $candidate = urldecode((string) $query[$param]);
'linkText' => trim(strip_tags($matches[2][$i])) ?: null, $ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION));
];
if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
return $ext;
}
}
}
} }
return $result; return null;
}
/**
* Findet Office-Links (docx, xlsx, pptx)
*/
private function findOfficeLinks(string $content): array
{
if (!preg_match_all(
'/<a\s+[^>]*href=["\']([^"\']*(?:\.(?:docx|xlsx|pptx)|p=(?:docx|xlsx|pptx)(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is',
$content,
$matches
)) {
return [];
}
$result = [];
foreach ($matches[1] as $i => $href) {
$result[] = [
'url' => html_entity_decode($href),
'linkText' => trim(strip_tags($matches[2][$i])) ?: null,
];
}
return $result;
} }
} }