Tika Title encoding

This commit is contained in:
Jürgen Mummert
2026-01-10 12:05:15 +01:00
parent 2bd52f77e0
commit 38372539c2
+63 -31
View File
@@ -22,18 +22,18 @@ class MeilisearchFilesParseCommand extends Command
{ {
$this $this
->setName('meilisearch:files:parse') ->setName('meilisearch:files:parse')
->setDescription('Parse indexed files via Apache Tika and store extracted text') ->setDescription('Parse indexed files via Apache Tika and enrich tl_search_files')
->addOption( ->addOption(
'limit', 'limit',
null, null,
InputOption::VALUE_OPTIONAL, InputOption::VALUE_OPTIONAL,
'Maximum number of files to check per run (optional)' 'Maximum number of files to check per run'
) )
->addOption( ->addOption(
'dry-run', 'dry-run',
null, null,
InputOption::VALUE_NONE, InputOption::VALUE_NONE,
'Do not send files to Tika, just show what would be parsed' 'Do not send files to Tika'
); );
} }
@@ -44,11 +44,9 @@ class MeilisearchFilesParseCommand extends Command
$dryRun = (bool) $input->getOption('dry-run'); $dryRun = (bool) $input->getOption('dry-run');
// ---- LIMIT: nur wenn explizit gesetzt
$limitOption = $input->getOption('limit'); $limitOption = $input->getOption('limit');
$limit = $limitOption !== null ? max(1, (int) $limitOption) : null; $limit = $limitOption !== null ? max(1, (int) $limitOption) : null;
// ---- Tika URL
$tikaUrl = rtrim((string) ($GLOBALS['TL_CONFIG']['meilisearch_tika_url'] ?? ''), '/'); $tikaUrl = rtrim((string) ($GLOBALS['TL_CONFIG']['meilisearch_tika_url'] ?? ''), '/');
if ($tikaUrl === '') { if ($tikaUrl === '') {
$output->writeln('<error>Tika URL not configured</error>'); $output->writeln('<error>Tika URL not configured</error>');
@@ -57,7 +55,6 @@ class MeilisearchFilesParseCommand extends Command
$db = Database::getInstance(); $db = Database::getInstance();
// ---- Files laden
$sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC"; $sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC";
if ($limit !== null) { if ($limit !== null) {
$sql .= " LIMIT " . (int) $limit; $sql .= " LIMIT " . (int) $limit;
@@ -80,7 +77,7 @@ class MeilisearchFilesParseCommand extends Command
$normalized = $originalUrl; $normalized = $originalUrl;
// ------------------------------------------------- // -------------------------------------------------
// 1) ?file=files/… // Normalize URL → files/…
// ------------------------------------------------- // -------------------------------------------------
if (str_contains($normalized, '?')) { if (str_contains($normalized, '?')) {
$parts = parse_url($normalized); $parts = parse_url($normalized);
@@ -95,20 +92,10 @@ class MeilisearchFilesParseCommand extends Command
} }
} }
// -------------------------------------------------
// 2) Fragment entfernen
// -------------------------------------------------
$normalized = strtok($normalized, '#'); $normalized = strtok($normalized, '#');
// -------------------------------------------------
// 3) URL-Decoding
// -------------------------------------------------
$normalized = rawurldecode($normalized); $normalized = rawurldecode($normalized);
// -------------------------------------------------
// 4) Nur lokale files/
// -------------------------------------------------
$normalized = ltrim($normalized, '/'); $normalized = ltrim($normalized, '/');
if (!str_starts_with($normalized, 'files/')) { if (!str_starts_with($normalized, 'files/')) {
$this->log('Not in files/, skip', ['url' => $originalUrl]); $this->log('Not in files/, skip', ['url' => $originalUrl]);
continue; continue;
@@ -128,7 +115,7 @@ class MeilisearchFilesParseCommand extends Command
$checksum = md5($normalized . '|' . $mtime); $checksum = md5($normalized . '|' . $mtime);
// ------------------------------------------------- // -------------------------------------------------
// 5) Skip unchanged // Skip unchanged
// ------------------------------------------------- // -------------------------------------------------
if ($file['checksum'] === $checksum && !empty($file['text'])) { if ($file['checksum'] === $checksum && !empty($file['text'])) {
continue; continue;
@@ -140,7 +127,7 @@ class MeilisearchFilesParseCommand extends Command
} }
// ------------------------------------------------- // -------------------------------------------------
// 6) MIME-Type // MIME-Type
// ------------------------------------------------- // -------------------------------------------------
$ext = strtolower(pathinfo($normalized, PATHINFO_EXTENSION)); $ext = strtolower(pathinfo($normalized, PATHINFO_EXTENSION));
@@ -158,12 +145,12 @@ class MeilisearchFilesParseCommand extends Command
} }
// ------------------------------------------------- // -------------------------------------------------
// 7) Tika parse // Tika BODY
// ------------------------------------------------- // -------------------------------------------------
try { try {
$this->log('Parsing file', ['url' => $normalized]); $this->log('Parsing file', ['url' => $normalized]);
$response = $client->request( $bodyResponse = $client->request(
'PUT', 'PUT',
$tikaUrl . '/tika/main', $tikaUrl . '/tika/main',
[ [
@@ -175,14 +162,65 @@ class MeilisearchFilesParseCommand extends Command
] ]
); );
$text = trim((string) $response->getContent(false)); $text = trim((string) $bodyResponse->getContent(false));
} catch (\Throwable $e) {
$this->log('Body parse failed', [
'url' => $normalized,
'error' => $e->getMessage(),
]);
continue;
}
// -------------------------------------------------
// Tika METADATA (title)
// -------------------------------------------------
// Best-effort title lookup via Tika's /meta endpoint. The title is optional:
// $title stays null when the file cannot be opened, the request fails, the
// response is not a JSON object, or no non-empty title field is present.
$title = null;

$metaHandle = fopen($absolutePath, 'rb');

if ($metaHandle === false) {
    $this->log('Metadata skipped, file not readable', [
        'url' => $normalized,
    ]);
} else {
    try {
        $metaResponse = $client->request(
            'PUT',
            $tikaUrl . '/meta',
            [
                'headers' => [
                    'Accept' => 'application/json',
                    'Content-Type' => $mimeType,
                ],
                'body' => $metaHandle,
            ]
        );

        $meta = json_decode($metaResponse->getContent(false), true);

        if (is_array($meta)) {
            // Preferred field first; the PDF docinfo title is the fallback.
            foreach (['dc:title', 'pdf:docinfo:title'] as $metaKey) {
                $candidate = $meta[$metaKey] ?? null;

                // A metadata value may arrive as a plain string or as a list of
                // strings. Indexing a string with [0] would return only its
                // first character, so unwrap lists explicitly instead.
                if (is_array($candidate)) {
                    $candidate = reset($candidate);
                }

                if (!is_string($candidate)) {
                    continue;
                }

                $candidate = trim(
                    html_entity_decode(
                        $candidate,
                        ENT_QUOTES | ENT_HTML5,
                        'UTF-8'
                    )
                );

                if ($candidate !== '') {
                    $title = $candidate;
                    break;
                }
            }
        }
    } catch (\Throwable $e) {
        // A missing title must not abort the file, but the failure is logged
        // so it can be diagnosed.
        $this->log('Metadata parse failed', [
            'url' => $normalized,
            'error' => $e->getMessage(),
        ]);
    } finally {
        // NOTE(review): assumes the client has finished reading the stream once
        // getContent() returned or threw — confirm for the HTTP client in use.
        if (is_resource($metaHandle)) {
            fclose($metaHandle);
        }
    }
}
// -------------------------------------------------
// Store result
// -------------------------------------------------
$db->prepare( $db->prepare(
"UPDATE tl_search_files "UPDATE tl_search_files
SET text = ?, checksum = ?, file_mtime = ?, tstamp = ? SET text = ?, title = ?, checksum = ?, file_mtime = ?, tstamp = ?
WHERE id = ?" WHERE id = ?"
)->execute( )->execute(
$text, $text,
$title,
$checksum, $checksum,
$mtime, $mtime,
time(), time(),
@@ -192,14 +230,8 @@ class MeilisearchFilesParseCommand extends Command
$this->log('File parsed', [ $this->log('File parsed', [
'url' => $normalized, 'url' => $normalized,
'chars' => mb_strlen($text), 'chars' => mb_strlen($text),
'title' => $title,
]); ]);
} catch (\Throwable $e) {
$this->log('Parse failed', [
'url' => $normalized,
'error' => $e->getMessage(),
]);
}
} }
$this->log('Parser finished'); $this->log('Parser finished');