Tika Title encoding

This commit is contained in:
Jürgen Mummert
2026-01-11 18:51:38 +01:00
parent 6e41df002e
commit 3427f6b60b
+73 -42
View File
@@ -4,23 +4,18 @@ namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
use Contao\Config; use Contao\Config;
use Contao\System; use Contao\System;
use Doctrine\DBAL\Connection;
class IndexPageListener class IndexPageListener
{ {
public function __construct( public function __construct()
private readonly Connection $connection, {
) {
} }
private function debug(string $message, array $context = []): void private function debug(string $message, array $context = []): void
{ {
// Debug bewusst immer aktiv (bis du es wieder entfernst) // Debug bewusst immer aktiv (bis du es wieder entfernst)
// Kontext kurz halten, damit Logs nicht explodieren // Kontext kurz halten, damit Logs nicht explodieren
$ctx = $context $ctx = $context ? ' | ' . json_encode($context, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE) : '';
? ' | ' . json_encode($context, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE)
: '';
error_log('[ContaoMeilisearch][IndexPageListener] ' . $message . $ctx); error_log('[ContaoMeilisearch][IndexPageListener] ' . $message . $ctx);
} }
@@ -68,31 +63,45 @@ class IndexPageListener
?? $parsed['page']['priority'] ?? $parsed['page']['priority']
?? null; ?? null;
$this->debug('Meta: priority candidate', ['priority' => $priority]);
if ($priority !== null && $priority !== '') { if ($priority !== null && $priority !== '') {
$set['priority'] = (int) $priority; $set['priority'] = (int) $priority;
} }
// KEYWORDS // KEYWORDS
$keywords = []; $keywordSources = [
foreach ([
$parsed['event']['keywords'] ?? null, $parsed['event']['keywords'] ?? null,
$parsed['news']['keywords'] ?? null, $parsed['news']['keywords'] ?? null,
$parsed['page']['keywords'] ?? null, $parsed['page']['keywords'] ?? null,
] as $src) { ];
if (is_string($src) && trim($src) !== '') {
$this->debug('Meta: keyword sources', ['sources' => $keywordSources]);
$keywords = [];
foreach ($keywordSources as $src) {
if (!is_string($src) || trim($src) === '') {
continue;
}
foreach (preg_split('/\s+/', trim($src)) as $word) { foreach (preg_split('/\s+/', trim($src)) as $word) {
$keywords[] = $word; $keywords[] = $word;
} }
} }
}
if ($keywords) { if ($keywords) {
$set['keywords'] = implode(' ', array_unique($keywords)); $set['keywords'] = implode(' ', array_unique($keywords));
} }
// IMAGEPATH $this->debug('Meta: keywords result', [
if (!empty($parsed['page']['searchimage'])) { 'keywords' => $set['keywords'] ?? null,
$set['imagepath'] = trim((string) $parsed['page']['searchimage']); ]);
// IMAGEPATH (UUID)
$searchImage = $parsed['page']['searchimage'] ?? null;
$this->debug('Meta: searchimage candidate', ['searchimage' => $searchImage]);
if (!empty($searchImage)) {
$set['imagepath'] = trim((string) $searchImage);
} }
// STARTDATE // STARTDATE
@@ -101,11 +110,14 @@ class IndexPageListener
?? $parsed['news']['startDate'] ?? $parsed['news']['startDate']
?? null; ?? null;
$this->debug('Meta: startDate candidate', ['startDate' => $startDate]);
if (is_numeric($startDate) && (int) $startDate > 0) { if (is_numeric($startDate) && (int) $startDate > 0) {
$set['startDate'] = (int) $startDate; $set['startDate'] = (int) $startDate;
} }
// CHECKSUM // CHECKSUM
try {
$checksumSeed = (string) ($data['checksum'] ?? ''); $checksumSeed = (string) ($data['checksum'] ?? '');
$checksumSeed .= '|' . ($set['keywords'] ?? ''); $checksumSeed .= '|' . ($set['keywords'] ?? '');
$checksumSeed .= '|' . ($set['priority'] ?? ''); $checksumSeed .= '|' . ($set['priority'] ?? '');
@@ -113,6 +125,17 @@ class IndexPageListener
$checksumSeed .= '|' . ($set['startDate'] ?? ''); $checksumSeed .= '|' . ($set['startDate'] ?? '');
$set['checksum'] = md5($checksumSeed); $set['checksum'] = md5($checksumSeed);
$this->debug('Checksum generated', [
'seed_preview' => substr($checksumSeed, 0, 120) . (strlen($checksumSeed) > 120 ? '…' : ''),
'checksum' => $set['checksum'],
]);
} catch (\Throwable $e) {
$this->debug('Failed to generate checksum', [
'error' => $e->getMessage(),
'class' => $e::class,
]);
}
} }
} }
@@ -122,14 +145,24 @@ class IndexPageListener
* ===================== * =====================
*/ */
if ((int) ($data['protected'] ?? 0) !== 0) { if ((int) ($data['protected'] ?? 0) !== 0) {
$this->debug('Abort: protected page', ['protected' => $data['protected'] ?? null]);
return; return;
} }
if (!Config::get('meilisearch_index_files')) { $indexFiles = (bool) Config::get('meilisearch_index_files');
$this->debug('File indexing setting', [
'meilisearch_index_files' => $indexFiles,
]);
if (!$indexFiles) {
$this->debug('Abort: file indexing disabled');
return; return;
} }
$links = $this->findAllLinks($content); $links = $this->findAllLinks($content);
$this->debug('Links found', ['count' => count($links)]);
$fileLinks = []; $fileLinks = [];
foreach ($links as $link) { foreach ($links as $link) {
@@ -139,47 +172,28 @@ class IndexPageListener
} }
} }
if (!$fileLinks) { $this->debug('Indexable file links found', [
return; 'count' => count($fileLinks),
} 'types' => array_count_values(array_column($fileLinks, 'type')),
]);
$db = $this->connection; if ($fileLinks) {
$db = System::getContainer()->get('database_connection');
$time = time(); $time = time();
foreach ($fileLinks as $file) { foreach ($fileLinks as $file) {
$url = strtok($file['url'], '#'); $url = strtok($file['url'], '#');
/*
* =====================
* URL-NORMALISIERUNG (NEU)
* =====================
*/
if (str_contains($url, 'p=')) {
parse_str(parse_url($url, PHP_URL_QUERY) ?? '', $query);
if (!empty($query['p'])) {
$url = '/' . ltrim(rawurldecode($query['p']), '/');
$this->debug('Normalized download URL', ['url' => $url]);
}
}
$path = parse_url($url, PHP_URL_PATH); $path = parse_url($url, PHP_URL_PATH);
$abs = $path ? TL_ROOT . '/' . ltrim($path, '/') : null; $abs = $path ? TL_ROOT . '/' . ltrim($path, '/') : null;
$mtime = ($abs && is_file($abs)) ? filemtime($abs) : 0; $mtime = ($abs && is_file($abs)) ? filemtime($abs) : 0;
$checksum = md5($url . '|' . $mtime); $checksum = md5($url . '|' . $mtime);
try {
$existing = $db->fetchAssociative( $existing = $db->fetchAssociative(
'SELECT id, checksum FROM tl_search_files WHERE url = ?', 'SELECT id, checksum FROM tl_search_files WHERE url = ?',
[$url] [$url]
); );
} catch (\Throwable $e) {
$this->debug('DB error', [
'url' => $url,
'error' => $e->getMessage(),
]);
continue;
}
if ($existing) { if ($existing) {
$db->update( $db->update(
@@ -193,6 +207,11 @@ class IndexPageListener
], ],
['id' => $existing['id']] ['id' => $existing['id']]
); );
$this->debug('File updated', [
'url' => $url,
'checksum' => $checksum,
]);
} else { } else {
$db->insert( $db->insert(
'tl_search_files', 'tl_search_files',
@@ -207,6 +226,12 @@ class IndexPageListener
'checksum' => $checksum, 'checksum' => $checksum,
] ]
); );
$this->debug('File inserted', [
'url' => $url,
'checksum' => $checksum,
]);
}
} }
} }
@@ -256,7 +281,11 @@ class IndexPageListener
private function detectIndexableFileType(string $url): ?string private function detectIndexableFileType(string $url): ?string
{ {
$url = strtok($url, '#'); $url = strtok($url, '#');
$parts = parse_url($url); $parts = parse_url($url);
if (!$parts) {
return null;
}
if (!empty($parts['path'])) { if (!empty($parts['path'])) {
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION)); $ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
@@ -267,10 +296,12 @@ class IndexPageListener
if (!empty($parts['query'])) { if (!empty($parts['query'])) {
parse_str($parts['query'], $query); parse_str($parts['query'], $query);
foreach (['file', 'p', 'f'] as $param) { foreach (['file', 'p', 'f'] as $param) {
if (!empty($query[$param])) { if (!empty($query[$param])) {
$candidate = rawurldecode(html_entity_decode((string) $query[$param], ENT_QUOTES)); $candidate = rawurldecode(html_entity_decode((string) $query[$param], ENT_QUOTES));
$ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION)); $ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION));
if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) { if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
return $ext; return $ext;
} }