Tika Title encoding

This commit is contained in:
Jürgen Mummert
2026-01-10 18:31:00 +01:00
parent c085911877
commit 17da2a8434
+12 -11
View File
@@ -55,8 +55,6 @@ class MeilisearchFilesParseCommand extends Command
$db = Database::getInstance(); $db = Database::getInstance();
$db->query("SET NAMES utf8mb4 COLLATE utf8mb4_unicode_ci");
$sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC"; $sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC";
if ($limit !== null) { if ($limit !== null) {
$sql .= " LIMIT " . (int) $limit; $sql .= " LIMIT " . (int) $limit;
@@ -200,12 +198,10 @@ class MeilisearchFilesParseCommand extends Command
?? null; ?? null;
if ($rawTitle) { if ($rawTitle) {
$title = trim( $title = html_entity_decode(
html_entity_decode( $rawTitle,
$rawTitle, ENT_QUOTES | ENT_HTML5,
ENT_QUOTES | ENT_HTML5, 'UTF-8'
'UTF-8'
)
); );
} }
@@ -214,15 +210,20 @@ class MeilisearchFilesParseCommand extends Command
} }
// ------------------------------------------------- // -------------------------------------------------
// TITLE FALLBACK (REQUIRED) // TITLE FALLBACK
// ------------------------------------------------- // -------------------------------------------------
if (!$title) { if (!$title) {
$title = pathinfo($normalized, PATHINFO_FILENAME); $title = pathinfo($normalized, PATHINFO_FILENAME);
$title = str_replace(['_', '-'], ' ', $title); $title = str_replace(['_', '-'], ' ', $title);
$title = preg_replace('/\s+/u', ' ', $title);
$title = trim($title);
} }
// -------------------------------------------------
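// Sanitize the title: drop invalid UTF-8 byte sequences, then collapse whitespace runs and trim.
// NOTE(review): iconv() returns false on failure and preg_replace() returns null on error;
// confirm $title is still a string before it is used further.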
// -------------------------------------------------
$title = iconv('UTF-8', 'UTF-8//IGNORE', $title);
$title = preg_replace('/\s+/u', ' ', $title);
$title = trim($title);
// ------------------------------------------------- // -------------------------------------------------
// Store result // Store result
// ------------------------------------------------- // -------------------------------------------------