8 Commits

Author SHA1 Message Date
Jürgen Mummert 86b81affdc Tika Title encoding 2026-01-10 19:06:58 +01:00
Jürgen Mummert 2d3ddac945 Tika Title encoding 2026-01-10 18:57:13 +01:00
Jürgen Mummert 17da2a8434 Tika Title encoding 2026-01-10 18:31:00 +01:00
Jürgen Mummert c085911877 Tika Title encoding 2026-01-10 18:26:00 +01:00
Jürgen Mummert 40792870bd Tika Title encoding 2026-01-10 12:30:20 +01:00
Jürgen Mummert 38372539c2 Tika Title encoding 2026-01-10 12:05:15 +01:00
Jürgen Mummert 2bd52f77e0 new Twig 2026-01-09 22:04:52 +01:00
Jürgen Mummert 99ef883da5 new Twig 2026-01-09 22:01:16 +01:00
2 changed files with 100 additions and 43 deletions
+98 -43
View File
@@ -22,18 +22,18 @@ class MeilisearchFilesParseCommand extends Command
{ {
$this $this
->setName('meilisearch:files:parse') ->setName('meilisearch:files:parse')
->setDescription('Parse indexed files via Apache Tika and store extracted text') ->setDescription('Parse indexed files via Apache Tika and enrich tl_search_files')
->addOption( ->addOption(
'limit', 'limit',
null, null,
InputOption::VALUE_OPTIONAL, InputOption::VALUE_OPTIONAL,
'Maximum number of files to check per run (optional)' 'Maximum number of files to check per run'
) )
->addOption( ->addOption(
'dry-run', 'dry-run',
null, null,
InputOption::VALUE_NONE, InputOption::VALUE_NONE,
'Do not send files to Tika, just show what would be parsed' 'Do not send files to Tika'
); );
} }
@@ -44,11 +44,9 @@ class MeilisearchFilesParseCommand extends Command
$dryRun = (bool) $input->getOption('dry-run'); $dryRun = (bool) $input->getOption('dry-run');
// ---- LIMIT: nur wenn explizit gesetzt
$limitOption = $input->getOption('limit'); $limitOption = $input->getOption('limit');
$limit = $limitOption !== null ? max(1, (int) $limitOption) : null; $limit = $limitOption !== null ? max(1, (int) $limitOption) : null;
// ---- Tika URL
$tikaUrl = rtrim((string) ($GLOBALS['TL_CONFIG']['meilisearch_tika_url'] ?? ''), '/'); $tikaUrl = rtrim((string) ($GLOBALS['TL_CONFIG']['meilisearch_tika_url'] ?? ''), '/');
if ($tikaUrl === '') { if ($tikaUrl === '') {
$output->writeln('<error>Tika URL not configured</error>'); $output->writeln('<error>Tika URL not configured</error>');
@@ -57,7 +55,6 @@ class MeilisearchFilesParseCommand extends Command
$db = Database::getInstance(); $db = Database::getInstance();
// ---- Files laden
$sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC"; $sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC";
if ($limit !== null) { if ($limit !== null) {
$sql .= " LIMIT " . (int) $limit; $sql .= " LIMIT " . (int) $limit;
@@ -76,11 +73,12 @@ class MeilisearchFilesParseCommand extends Command
foreach ($files as $file) { foreach ($files as $file) {
$originalUrl = (string) $file['url']; $originalUrl = (string) $file['url'];
$normalized = $originalUrl; $existingTitle = trim((string) ($file['title'] ?? ''));
$normalized = $originalUrl;
// ------------------------------------------------- // -------------------------------------------------
// 1) ?file=files/… // Normalize URL
// ------------------------------------------------- // -------------------------------------------------
if (str_contains($normalized, '?')) { if (str_contains($normalized, '?')) {
$parts = parse_url($normalized); $parts = parse_url($normalized);
@@ -95,20 +93,10 @@ class MeilisearchFilesParseCommand extends Command
} }
} }
// -------------------------------------------------
// 2) Fragment entfernen
// -------------------------------------------------
$normalized = strtok($normalized, '#'); $normalized = strtok($normalized, '#');
// -------------------------------------------------
// 3) URL-Decoding
// -------------------------------------------------
$normalized = rawurldecode($normalized); $normalized = rawurldecode($normalized);
// -------------------------------------------------
// 4) Nur lokale files/
// -------------------------------------------------
$normalized = ltrim($normalized, '/'); $normalized = ltrim($normalized, '/');
if (!str_starts_with($normalized, 'files/')) { if (!str_starts_with($normalized, 'files/')) {
$this->log('Not in files/, skip', ['url' => $originalUrl]); $this->log('Not in files/, skip', ['url' => $originalUrl]);
continue; continue;
@@ -128,7 +116,7 @@ class MeilisearchFilesParseCommand extends Command
$checksum = md5($normalized . '|' . $mtime); $checksum = md5($normalized . '|' . $mtime);
// ------------------------------------------------- // -------------------------------------------------
// 5) Skip unchanged // Skip unchanged
// ------------------------------------------------- // -------------------------------------------------
if ($file['checksum'] === $checksum && !empty($file['text'])) { if ($file['checksum'] === $checksum && !empty($file['text'])) {
continue; continue;
@@ -140,7 +128,7 @@ class MeilisearchFilesParseCommand extends Command
} }
// ------------------------------------------------- // -------------------------------------------------
// 6) MIME-Type // MIME-Type
// ------------------------------------------------- // -------------------------------------------------
$ext = strtolower(pathinfo($normalized, PATHINFO_EXTENSION)); $ext = strtolower(pathinfo($normalized, PATHINFO_EXTENSION));
@@ -158,12 +146,12 @@ class MeilisearchFilesParseCommand extends Command
} }
// ------------------------------------------------- // -------------------------------------------------
// 7) Tika parse // Tika BODY (roher Plaintext)
// ------------------------------------------------- // -------------------------------------------------
try { try {
$this->log('Parsing file', ['url' => $normalized]); $this->log('Parsing file', ['url' => $normalized]);
$response = $client->request( $bodyResponse = $client->request(
'PUT', 'PUT',
$tikaUrl . '/tika/main', $tikaUrl . '/tika/main',
[ [
@@ -175,31 +163,98 @@ class MeilisearchFilesParseCommand extends Command
] ]
); );
$text = trim((string) $response->getContent(false)); $text = trim((string) $bodyResponse->getContent(false));
$db->prepare(
"UPDATE tl_search_files
SET text = ?, checksum = ?, file_mtime = ?, tstamp = ?
WHERE id = ?"
)->execute(
$text,
$checksum,
$mtime,
time(),
$file['id']
);
$this->log('File parsed', [
'url' => $normalized,
'chars' => mb_strlen($text),
]);
} catch (\Throwable $e) { } catch (\Throwable $e) {
$this->log('Parse failed', [ $this->log('Body parse failed', [
'url' => $normalized, 'url' => $normalized,
'error' => $e->getMessage(), 'error' => $e->getMessage(),
]); ]);
continue;
} }
// -------------------------------------------------
// TITLE: keep existing editor-defined title
// -------------------------------------------------
$title = $existingTitle !== '' ? $existingTitle : null;
// -------------------------------------------------
// Tika METADATA (Title) only if no existing title
// -------------------------------------------------
if ($title === null) {
try {
$metaResponse = $client->request(
'PUT',
$tikaUrl . '/meta',
[
'headers' => [
'Accept' => 'application/json',
'Content-Type' => $mimeType,
],
'body' => fopen($absolutePath, 'rb'),
]
);
$meta = json_decode($metaResponse->getContent(false), true);
$rawTitle =
$meta['dc:title'][0]
?? $meta['pdf:docinfo:title'][0]
?? null;
if ($rawTitle) {
$title = html_entity_decode(
$rawTitle,
ENT_QUOTES | ENT_HTML5,
'UTF-8'
);
}
} catch (\Throwable) {
// Metadata optional
}
}
// -------------------------------------------------
// TITLE → ASCII SAFE (only if newly generated)
// -------------------------------------------------
if ($existingTitle === '' && $title) {
$title = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $title);
$title = preg_replace('/\s+/', ' ', $title);
$title = trim($title);
}
// -------------------------------------------------
// FALLBACK: Dateiname (only if still empty)
// -------------------------------------------------
if (!$title || strlen($title) < 5) {
$title = pathinfo($normalized, PATHINFO_FILENAME);
$title = str_replace(['_', '-'], ' ', $title);
$title = preg_replace('/\s+/', ' ', $title);
$title = trim($title);
}
// -------------------------------------------------
// Store result
// -------------------------------------------------
$db->prepare(
"UPDATE tl_search_files
SET text = ?, title = ?, checksum = ?, file_mtime = ?, tstamp = ?
WHERE id = ?"
)->execute(
$text,
$title,
$checksum,
$mtime,
time(),
$file['id']
);
$this->log('File parsed', [
'url' => $normalized,
'chars' => mb_strlen($text),
'title' => $title,
]);
} }
$this->log('Parser finished'); $this->log('Parser finished');
@@ -4,6 +4,7 @@ Contao 5 Frontend Module Template
#} #}
<!-- indexer::stop --> <!-- indexer::stop -->
{% block meilisearch %}
<div <div
id="topsearch" id="topsearch"
class="meilisearch-search" class="meilisearch-search"
@@ -218,4 +219,5 @@ Contao 5 Frontend Module Template
} }
}); });
</script> </script>
{% endblock %}
<!-- indexer::continue --> <!-- indexer::continue -->