Tika Title encoding

This commit is contained in:
Jürgen Mummert
2026-01-10 19:06:58 +01:00
parent 2d3ddac945
commit 3ee7af7ce9
+41 -38
View File
@@ -73,8 +73,9 @@ class MeilisearchFilesParseCommand extends Command
foreach ($files as $file) { foreach ($files as $file) {
$originalUrl = (string) $file['url']; $originalUrl = (string) $file['url'];
$normalized = $originalUrl; $existingTitle = trim((string) ($file['title'] ?? ''));
$normalized = $originalUrl;
// ------------------------------------------------- // -------------------------------------------------
// Normalize URL // Normalize URL
@@ -173,56 +174,58 @@ class MeilisearchFilesParseCommand extends Command
} }
// ------------------------------------------------- // -------------------------------------------------
// Tika METADATA (Title) // TITLE: keep existing editor-defined title
// ------------------------------------------------- // -------------------------------------------------
$title = null; $title = $existingTitle !== '' ? $existingTitle : null;
try { // -------------------------------------------------
$metaResponse = $client->request( // Tika METADATA (Title) only if no existing title
'PUT', // -------------------------------------------------
$tikaUrl . '/meta', if ($title === null) {
[ try {
'headers' => [ $metaResponse = $client->request(
'Accept' => 'application/json', 'PUT',
'Content-Type' => $mimeType, $tikaUrl . '/meta',
], [
'body' => fopen($absolutePath, 'rb'), 'headers' => [
] 'Accept' => 'application/json',
); 'Content-Type' => $mimeType,
],
$meta = json_decode($metaResponse->getContent(false), true); 'body' => fopen($absolutePath, 'rb'),
]
$rawTitle =
$meta['dc:title'][0]
?? $meta['pdf:docinfo:title'][0]
?? null;
if ($rawTitle) {
$title = html_entity_decode(
$rawTitle,
ENT_QUOTES | ENT_HTML5,
'UTF-8'
); );
}
} catch (\Throwable) { $meta = json_decode($metaResponse->getContent(false), true);
// Metadata optional
$rawTitle =
$meta['dc:title'][0]
?? $meta['pdf:docinfo:title'][0]
?? null;
if ($rawTitle) {
$title = html_entity_decode(
$rawTitle,
ENT_QUOTES | ENT_HTML5,
'UTF-8'
);
}
} catch (\Throwable) {
// Metadata optional
}
} }
// ------------------------------------------------- // -------------------------------------------------
// TITLE → ASCII SAFE (DELIBERATE DATA LOSS) // TITLE → ASCII SAFE (only if newly generated)
// ------------------------------------------------- // -------------------------------------------------
if ($title) { if ($existingTitle === '' && $title) {
// UTF-8 → ASCII, Unbekanntes verwerfen
$title = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $title); $title = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $title);
// Normalisieren
$title = preg_replace('/\s+/', ' ', $title); $title = preg_replace('/\s+/', ' ', $title);
$title = trim($title); $title = trim($title);
} }
// ------------------------------------------------- // -------------------------------------------------
// FALLBACK: Dateiname // FALLBACK: Dateiname (only if still empty)
// ------------------------------------------------- // -------------------------------------------------
if (!$title || strlen($title) < 5) { if (!$title || strlen($title) < 5) {
$title = pathinfo($normalized, PATHINFO_FILENAME); $title = pathinfo($normalized, PATHINFO_FILENAME);