Tika Title encoding
This commit is contained in:
@@ -73,8 +73,9 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
|
|
||||||
foreach ($files as $file) {
|
foreach ($files as $file) {
|
||||||
|
|
||||||
$originalUrl = (string) $file['url'];
|
$originalUrl = (string) $file['url'];
|
||||||
$normalized = $originalUrl;
|
$existingTitle = trim((string) ($file['title'] ?? ''));
|
||||||
|
$normalized = $originalUrl;
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// Normalize URL
|
// Normalize URL
|
||||||
@@ -173,56 +174,58 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// Tika METADATA (Title)
|
// TITLE: keep existing editor-defined title
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
$title = null;
|
$title = $existingTitle !== '' ? $existingTitle : null;
|
||||||
|
|
||||||
try {
|
// -------------------------------------------------
|
||||||
$metaResponse = $client->request(
|
// Tika METADATA (Title) – only if no existing title
|
||||||
'PUT',
|
// -------------------------------------------------
|
||||||
$tikaUrl . '/meta',
|
if ($title === null) {
|
||||||
[
|
try {
|
||||||
'headers' => [
|
$metaResponse = $client->request(
|
||||||
'Accept' => 'application/json',
|
'PUT',
|
||||||
'Content-Type' => $mimeType,
|
$tikaUrl . '/meta',
|
||||||
],
|
[
|
||||||
'body' => fopen($absolutePath, 'rb'),
|
'headers' => [
|
||||||
]
|
'Accept' => 'application/json',
|
||||||
);
|
'Content-Type' => $mimeType,
|
||||||
|
],
|
||||||
$meta = json_decode($metaResponse->getContent(false), true);
|
'body' => fopen($absolutePath, 'rb'),
|
||||||
|
]
|
||||||
$rawTitle =
|
|
||||||
$meta['dc:title'][0]
|
|
||||||
?? $meta['pdf:docinfo:title'][0]
|
|
||||||
?? null;
|
|
||||||
|
|
||||||
if ($rawTitle) {
|
|
||||||
$title = html_entity_decode(
|
|
||||||
$rawTitle,
|
|
||||||
ENT_QUOTES | ENT_HTML5,
|
|
||||||
'UTF-8'
|
|
||||||
);
|
);
|
||||||
}
|
|
||||||
|
|
||||||
} catch (\Throwable) {
|
$meta = json_decode($metaResponse->getContent(false), true);
|
||||||
// Metadata optional
|
|
||||||
|
$rawTitle =
|
||||||
|
$meta['dc:title'][0]
|
||||||
|
?? $meta['pdf:docinfo:title'][0]
|
||||||
|
?? null;
|
||||||
|
|
||||||
|
if ($rawTitle) {
|
||||||
|
$title = html_entity_decode(
|
||||||
|
$rawTitle,
|
||||||
|
ENT_QUOTES | ENT_HTML5,
|
||||||
|
'UTF-8'
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (\Throwable) {
|
||||||
|
// Metadata optional
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// TITLE → ASCII SAFE (DELIBERATE DATA LOSS)
|
// TITLE → ASCII SAFE (only if newly generated)
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
if ($title) {
|
if ($existingTitle === '' && $title) {
|
||||||
// UTF-8 → ASCII, Unbekanntes verwerfen
|
|
||||||
$title = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $title);
|
$title = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $title);
|
||||||
|
|
||||||
// Normalisieren
|
|
||||||
$title = preg_replace('/\s+/', ' ', $title);
|
$title = preg_replace('/\s+/', ' ', $title);
|
||||||
$title = trim($title);
|
$title = trim($title);
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// FALLBACK: Dateiname
|
// FALLBACK: Dateiname (only if still empty)
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
if (!$title || strlen($title) < 5) {
|
if (!$title || strlen($title) < 5) {
|
||||||
$title = pathinfo($normalized, PATHINFO_FILENAME);
|
$title = pathinfo($normalized, PATHINFO_FILENAME);
|
||||||
|
|||||||
Reference in New Issue
Block a user