Tika Title encoding

This commit is contained in:
Jürgen Mummert
2026-01-11 18:49:15 +01:00
parent 838f574574
commit 6e41df002e
+87 -27
View File
@@ -3,6 +3,7 @@
namespace MummertMedia\ContaoMeilisearchBundle\EventListener; namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
use Contao\Config; use Contao\Config;
use Contao\System;
use Doctrine\DBAL\Connection; use Doctrine\DBAL\Connection;
class IndexPageListener class IndexPageListener
@@ -14,6 +15,8 @@ class IndexPageListener
private function debug(string $message, array $context = []): void private function debug(string $message, array $context = []): void
{ {
// Debug bewusst immer aktiv (bis du es wieder entfernst)
// Kontext kurz halten, damit Logs nicht explodieren
$ctx = $context $ctx = $context
? ' | ' . json_encode($context, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE) ? ' | ' . json_encode($context, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE)
: ''; : '';
@@ -25,6 +28,9 @@ class IndexPageListener
{ {
$this->debug('Hook start', [ $this->debug('Hook start', [
'url' => $data['url'] ?? null, 'url' => $data['url'] ?? null,
'protected' => $data['protected'] ?? null,
'checksum' => $data['checksum'] ?? null,
'set_keys' => array_keys($set),
]); ]);
/* /*
@@ -32,21 +38,79 @@ class IndexPageListener
* SEITEN-METADATEN * SEITEN-METADATEN
* ===================== * =====================
*/ */
if (str_contains($content, 'MEILISEARCH_JSON')) { $hasMeta = str_contains($content, 'MEILISEARCH_JSON');
$this->debug('Meta marker scan', [
'contains_MEILISEARCH_JSON' => $hasMeta,
'content_length' => strlen($content),
]);
if ($hasMeta) {
try {
$parsed = $this->extractMeilisearchJson($content); $parsed = $this->extractMeilisearchJson($content);
$this->debug('extractMeilisearchJson(): done', [
'parsed_is_array' => is_array($parsed),
'parsed_keys' => is_array($parsed) ? array_keys($parsed) : null,
]);
} catch (\Throwable $e) {
$this->debug('Failed to extract MEILISEARCH_JSON', [
'error' => $e->getMessage(),
'class' => $e::class,
]);
$parsed = null;
}
if (is_array($parsed)) { if (is_array($parsed)) {
if (isset($parsed['page']['priority'])) {
$set['priority'] = (int) $parsed['page']['priority']; // PRIORITY
$priority =
$parsed['event']['priority']
?? $parsed['news']['priority']
?? $parsed['page']['priority']
?? null;
if ($priority !== null && $priority !== '') {
$set['priority'] = (int) $priority;
} }
// KEYWORDS
$keywords = [];
foreach ([
$parsed['event']['keywords'] ?? null,
$parsed['news']['keywords'] ?? null,
$parsed['page']['keywords'] ?? null,
] as $src) {
if (is_string($src) && trim($src) !== '') {
foreach (preg_split('/\s+/', trim($src)) as $word) {
$keywords[] = $word;
}
}
}
if ($keywords) {
$set['keywords'] = implode(' ', array_unique($keywords));
}
// IMAGEPATH
if (!empty($parsed['page']['searchimage'])) { if (!empty($parsed['page']['searchimage'])) {
$set['imagepath'] = (string) $parsed['page']['searchimage']; $set['imagepath'] = trim((string) $parsed['page']['searchimage']);
} }
// STARTDATE
$startDate =
$parsed['event']['startDate']
?? $parsed['news']['startDate']
?? null;
if (is_numeric($startDate) && (int) $startDate > 0) {
$set['startDate'] = (int) $startDate;
}
// CHECKSUM
$checksumSeed = (string) ($data['checksum'] ?? ''); $checksumSeed = (string) ($data['checksum'] ?? '');
$checksumSeed .= '|' . ($set['keywords'] ?? '');
$checksumSeed .= '|' . ($set['priority'] ?? ''); $checksumSeed .= '|' . ($set['priority'] ?? '');
$checksumSeed .= '|' . ($set['imagepath'] ?? ''); $checksumSeed .= '|' . ($set['imagepath'] ?? '');
$checksumSeed .= '|' . ($set['startDate'] ?? '');
$set['checksum'] = md5($checksumSeed); $set['checksum'] = md5($checksumSeed);
} }
@@ -54,7 +118,7 @@ class IndexPageListener
/* /*
* ===================== * =====================
* DATEI-ERKENNUNG * DATEI-ERKENNUNG + UPSERT
* ===================== * =====================
*/ */
if ((int) ($data['protected'] ?? 0) !== 0) { if ((int) ($data['protected'] ?? 0) !== 0) {
@@ -83,30 +147,21 @@ class IndexPageListener
$time = time(); $time = time();
foreach ($fileLinks as $file) { foreach ($fileLinks as $file) {
$url = strtok($file['url'], '#'); $url = strtok($file['url'], '#');
/* /*
* ===================== * =====================
* 🔥 DOWNLOAD-URL-NORMALISIERUNG (FIX) * URL-NORMALISIERUNG (NEU)
* ===================== * =====================
*/ */
if (str_contains($url, 'p=')) { if (str_contains($url, 'p=')) {
parse_str(parse_url($url, PHP_URL_QUERY) ?? '', $q); parse_str(parse_url($url, PHP_URL_QUERY) ?? '', $query);
if (!empty($query['p'])) {
if (!empty($q['p'])) { $url = '/' . ltrim(rawurldecode($query['p']), '/');
$url = '/' . ltrim(rawurldecode($q['p']), '/'); $this->debug('Normalized download URL', ['url' => $url]);
$this->debug('Normalized download URL', [
'normalized' => $url,
]);
} }
} }
if (!is_string($url) || $url === '') {
continue;
}
$path = parse_url($url, PHP_URL_PATH); $path = parse_url($url, PHP_URL_PATH);
$abs = $path ? TL_ROOT . '/' . ltrim($path, '/') : null; $abs = $path ? TL_ROOT . '/' . ltrim($path, '/') : null;
@@ -115,11 +170,11 @@ class IndexPageListener
try { try {
$existing = $db->fetchAssociative( $existing = $db->fetchAssociative(
'SELECT id FROM tl_search_files WHERE url = ?', 'SELECT id, checksum FROM tl_search_files WHERE url = ?',
[$url] [$url]
); );
} catch (\Throwable $e) { } catch (\Throwable $e) {
$this->debug('DB error (file indexing)', [ $this->debug('DB error', [
'url' => $url, 'url' => $url,
'error' => $e->getMessage(), 'error' => $e->getMessage(),
]); ]);
@@ -155,10 +210,12 @@ class IndexPageListener
} }
} }
$this->debug('Hook end'); $this->debug('Hook end', [
'final_set_keys' => array_keys($set),
]);
} }
/* === Hilfsmethoden === */ /* === Hilfsmethoden unverändert === */
private function extractMeilisearchJson(string $content): ?array private function extractMeilisearchJson(string $content): ?array
{ {
@@ -166,9 +223,12 @@ class IndexPageListener
return null; return null;
} }
$data = json_decode(trim($m[1]), true); $json = preg_replace('/^\xEF\xBB\xBF/', '', trim($m[1]));
$data = json_decode($json, true);
return json_last_error() === JSON_ERROR_NONE ? $data : null; return json_last_error() === JSON_ERROR_NONE && is_array($data)
? $data
: null;
} }
private function findAllLinks(string $content): array private function findAllLinks(string $content): array
@@ -207,10 +267,10 @@ class IndexPageListener
if (!empty($parts['query'])) { if (!empty($parts['query'])) {
parse_str($parts['query'], $query); parse_str($parts['query'], $query);
foreach (['file', 'p', 'f'] as $param) { foreach (['file', 'p', 'f'] as $param) {
if (!empty($query[$param])) { if (!empty($query[$param])) {
$ext = strtolower(pathinfo($query[$param], PATHINFO_EXTENSION)); $candidate = rawurldecode(html_entity_decode((string) $query[$param], ENT_QUOTES));
$ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION));
if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) { if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
return $ext; return $ext;
} }