8 Commits

Author SHA1 Message Date
Jürgen Mummert 86b81affdc Tika Title encoding 2026-01-10 19:06:58 +01:00
Jürgen Mummert 2d3ddac945 Tika Title encoding 2026-01-10 18:57:13 +01:00
Jürgen Mummert 17da2a8434 Tika Title encoding 2026-01-10 18:31:00 +01:00
Jürgen Mummert c085911877 Tika Title encoding 2026-01-10 18:26:00 +01:00
Jürgen Mummert 40792870bd Tika Title encoding 2026-01-10 12:30:20 +01:00
Jürgen Mummert 38372539c2 Tika Title encoding 2026-01-10 12:05:15 +01:00
Jürgen Mummert 2bd52f77e0 new Twig 2026-01-09 22:04:52 +01:00
Jürgen Mummert 99ef883da5 new Twig 2026-01-09 22:01:16 +01:00
2 changed files with 100 additions and 43 deletions
+98 -43
View File
@@ -22,18 +22,18 @@ class MeilisearchFilesParseCommand extends Command
{
$this
->setName('meilisearch:files:parse')
->setDescription('Parse indexed files via Apache Tika and store extracted text')
->setDescription('Parse indexed files via Apache Tika and enrich tl_search_files')
->addOption(
'limit',
null,
InputOption::VALUE_OPTIONAL,
'Maximum number of files to check per run (optional)'
'Maximum number of files to check per run'
)
->addOption(
'dry-run',
null,
InputOption::VALUE_NONE,
'Do not send files to Tika, just show what would be parsed'
'Do not send files to Tika'
);
}
@@ -44,11 +44,9 @@ class MeilisearchFilesParseCommand extends Command
$dryRun = (bool) $input->getOption('dry-run');
// ---- LIMIT: nur wenn explizit gesetzt
$limitOption = $input->getOption('limit');
$limit = $limitOption !== null ? max(1, (int) $limitOption) : null;
// ---- Tika URL
$tikaUrl = rtrim((string) ($GLOBALS['TL_CONFIG']['meilisearch_tika_url'] ?? ''), '/');
if ($tikaUrl === '') {
$output->writeln('<error>Tika URL not configured</error>');
@@ -57,7 +55,6 @@ class MeilisearchFilesParseCommand extends Command
$db = Database::getInstance();
// ---- Files laden
$sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC";
if ($limit !== null) {
$sql .= " LIMIT " . (int) $limit;
@@ -76,11 +73,12 @@ class MeilisearchFilesParseCommand extends Command
foreach ($files as $file) {
$originalUrl = (string) $file['url'];
$normalized = $originalUrl;
$originalUrl = (string) $file['url'];
$existingTitle = trim((string) ($file['title'] ?? ''));
$normalized = $originalUrl;
// -------------------------------------------------
// 1) ?file=files/…
// Normalize URL
// -------------------------------------------------
if (str_contains($normalized, '?')) {
$parts = parse_url($normalized);
@@ -95,20 +93,10 @@ class MeilisearchFilesParseCommand extends Command
}
}
// -------------------------------------------------
// 2) Fragment entfernen
// -------------------------------------------------
$normalized = strtok($normalized, '#');
// -------------------------------------------------
// 3) URL-Decoding
// -------------------------------------------------
$normalized = rawurldecode($normalized);
// -------------------------------------------------
// 4) Nur lokale files/
// -------------------------------------------------
$normalized = ltrim($normalized, '/');
if (!str_starts_with($normalized, 'files/')) {
$this->log('Not in files/, skip', ['url' => $originalUrl]);
continue;
@@ -128,7 +116,7 @@ class MeilisearchFilesParseCommand extends Command
$checksum = md5($normalized . '|' . $mtime);
// -------------------------------------------------
// 5) Skip unchanged
// Skip unchanged
// -------------------------------------------------
if ($file['checksum'] === $checksum && !empty($file['text'])) {
continue;
@@ -140,7 +128,7 @@ class MeilisearchFilesParseCommand extends Command
}
// -------------------------------------------------
// 6) MIME-Type
// MIME-Type
// -------------------------------------------------
$ext = strtolower(pathinfo($normalized, PATHINFO_EXTENSION));
@@ -158,12 +146,12 @@ class MeilisearchFilesParseCommand extends Command
}
// -------------------------------------------------
// 7) Tika parse
// Tika BODY (roher Plaintext)
// -------------------------------------------------
try {
$this->log('Parsing file', ['url' => $normalized]);
$response = $client->request(
$bodyResponse = $client->request(
'PUT',
$tikaUrl . '/tika/main',
[
@@ -175,31 +163,98 @@ class MeilisearchFilesParseCommand extends Command
]
);
$text = trim((string) $response->getContent(false));
$db->prepare(
"UPDATE tl_search_files
SET text = ?, checksum = ?, file_mtime = ?, tstamp = ?
WHERE id = ?"
)->execute(
$text,
$checksum,
$mtime,
time(),
$file['id']
);
$this->log('File parsed', [
'url' => $normalized,
'chars' => mb_strlen($text),
]);
$text = trim((string) $bodyResponse->getContent(false));
} catch (\Throwable $e) {
$this->log('Parse failed', [
$this->log('Body parse failed', [
'url' => $normalized,
'error' => $e->getMessage(),
]);
continue;
}
// -------------------------------------------------
// TITLE: a title already stored for this row always wins
// -------------------------------------------------
$title = $existingTitle !== '' ? $existingTitle : null;

// -------------------------------------------------
// Tika METADATA (title), only requested when no title is stored yet
// -------------------------------------------------
if ($title === null) {
    $metaHandle = null;

    try {
        $metaHandle = fopen($absolutePath, 'rb');

        if ($metaHandle !== false) {
            $metaResponse = $client->request(
                'PUT',
                $tikaUrl . '/meta',
                [
                    'headers' => [
                        'Accept' => 'application/json',
                        'Content-Type' => $mimeType,
                    ],
                    'body' => $metaHandle,
                ]
            );

            $meta = json_decode($metaResponse->getContent(false), true);

            // Anything but a decoded JSON object (error page, empty body) carries no metadata.
            if (is_array($meta)) {
                foreach (['dc:title', 'pdf:docinfo:title'] as $metaKey) {
                    $candidate = $meta[$metaKey] ?? null;

                    // A metadata value may be a plain string or a list of strings.
                    // Indexing a string with [0] would yield only its first character,
                    // so unwrap lists explicitly and leave strings untouched.
                    if (is_array($candidate)) {
                        $candidate = reset($candidate);
                    }

                    // Skip empty values so the next key still gets a chance.
                    if (is_string($candidate) && trim($candidate) !== '') {
                        $title = html_entity_decode(
                            $candidate,
                            ENT_QUOTES | ENT_HTML5,
                            'UTF-8'
                        );
                        break;
                    }
                }
            }
        }
    } catch (\Throwable) {
        // Metadata is optional: the file name fallback below takes over.
    } finally {
        // This runs once per file in the loop; release the handle so that
        // descriptors do not pile up over a long run.
        if (is_resource($metaHandle)) {
            fclose($metaHandle);
        }
    }
}

// -------------------------------------------------
// TITLE → ASCII SAFE (only if newly generated)
// -------------------------------------------------
if ($existingTitle === '' && $title !== null) {
    $asciiTitle = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $title);

    // iconv() returns false on failure. Treat that as "no usable title" so the
    // file name fallback applies instead of passing false on to preg_replace().
    $title = $asciiTitle === false
        ? ''
        : trim((string) preg_replace('/\s+/', ' ', $asciiTitle));
}

// -------------------------------------------------
// FALLBACK: file name
// Used when there is no title at all, or when a generated title is too
// short to be meaningful. A title that was already stored is never replaced.
// -------------------------------------------------
$titleIsGenerated = $existingTitle === '';

if ($title === null || $title === '' || ($titleIsGenerated && mb_strlen($title) < 5)) {
    $title = pathinfo($normalized, PATHINFO_FILENAME);
    $title = str_replace(['_', '-'], ' ', $title);
    $title = trim((string) preg_replace('/\s+/', ' ', $title));
}
// -------------------------------------------------
// Store result
// -------------------------------------------------
$db->prepare(
"UPDATE tl_search_files
SET text = ?, title = ?, checksum = ?, file_mtime = ?, tstamp = ?
WHERE id = ?"
)->execute(
$text,
$title,
$checksum,
$mtime,
time(),
$file['id']
);
$this->log('File parsed', [
'url' => $normalized,
'chars' => mb_strlen($text),
'title' => $title,
]);
}
$this->log('Parser finished');
@@ -4,6 +4,7 @@ Contao 5 Frontend Module Template
#}
<!-- indexer::stop -->
{% block meilisearch %}
<div
id="topsearch"
class="meilisearch-search"
@@ -218,4 +219,5 @@ Contao 5 Frontend Module Template
}
});
</script>
{% endblock %}
<!-- indexer::continue -->