Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 86b81affdc | |||
| 2d3ddac945 | |||
| 17da2a8434 | |||
| c085911877 | |||
| 40792870bd | |||
| 38372539c2 | |||
| 2bd52f77e0 | |||
| 99ef883da5 |
@@ -22,18 +22,18 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
{
|
{
|
||||||
$this
|
$this
|
||||||
->setName('meilisearch:files:parse')
|
->setName('meilisearch:files:parse')
|
||||||
->setDescription('Parse indexed files via Apache Tika and store extracted text')
|
->setDescription('Parse indexed files via Apache Tika and enrich tl_search_files')
|
||||||
->addOption(
|
->addOption(
|
||||||
'limit',
|
'limit',
|
||||||
null,
|
null,
|
||||||
InputOption::VALUE_OPTIONAL,
|
InputOption::VALUE_OPTIONAL,
|
||||||
'Maximum number of files to check per run (optional)'
|
'Maximum number of files to check per run'
|
||||||
)
|
)
|
||||||
->addOption(
|
->addOption(
|
||||||
'dry-run',
|
'dry-run',
|
||||||
null,
|
null,
|
||||||
InputOption::VALUE_NONE,
|
InputOption::VALUE_NONE,
|
||||||
'Do not send files to Tika, just show what would be parsed'
|
'Do not send files to Tika'
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -44,11 +44,9 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
|
|
||||||
$dryRun = (bool) $input->getOption('dry-run');
|
$dryRun = (bool) $input->getOption('dry-run');
|
||||||
|
|
||||||
// ---- LIMIT: nur wenn explizit gesetzt
|
|
||||||
$limitOption = $input->getOption('limit');
|
$limitOption = $input->getOption('limit');
|
||||||
$limit = $limitOption !== null ? max(1, (int) $limitOption) : null;
|
$limit = $limitOption !== null ? max(1, (int) $limitOption) : null;
|
||||||
|
|
||||||
// ---- Tika URL
|
|
||||||
$tikaUrl = rtrim((string) ($GLOBALS['TL_CONFIG']['meilisearch_tika_url'] ?? ''), '/');
|
$tikaUrl = rtrim((string) ($GLOBALS['TL_CONFIG']['meilisearch_tika_url'] ?? ''), '/');
|
||||||
if ($tikaUrl === '') {
|
if ($tikaUrl === '') {
|
||||||
$output->writeln('<error>Tika URL not configured</error>');
|
$output->writeln('<error>Tika URL not configured</error>');
|
||||||
@@ -57,7 +55,6 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
|
|
||||||
$db = Database::getInstance();
|
$db = Database::getInstance();
|
||||||
|
|
||||||
// ---- Files laden
|
|
||||||
$sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC";
|
$sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC";
|
||||||
if ($limit !== null) {
|
if ($limit !== null) {
|
||||||
$sql .= " LIMIT " . (int) $limit;
|
$sql .= " LIMIT " . (int) $limit;
|
||||||
@@ -77,10 +74,11 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
foreach ($files as $file) {
|
foreach ($files as $file) {
|
||||||
|
|
||||||
$originalUrl = (string) $file['url'];
|
$originalUrl = (string) $file['url'];
|
||||||
|
$existingTitle = trim((string) ($file['title'] ?? ''));
|
||||||
$normalized = $originalUrl;
|
$normalized = $originalUrl;
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// 1) ?file=files/…
|
// Normalize URL
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
if (str_contains($normalized, '?')) {
|
if (str_contains($normalized, '?')) {
|
||||||
$parts = parse_url($normalized);
|
$parts = parse_url($normalized);
|
||||||
@@ -95,20 +93,10 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------
|
|
||||||
// 2) Fragment entfernen
|
|
||||||
// -------------------------------------------------
|
|
||||||
$normalized = strtok($normalized, '#');
|
$normalized = strtok($normalized, '#');
|
||||||
|
|
||||||
// -------------------------------------------------
|
|
||||||
// 3) URL-Decoding
|
|
||||||
// -------------------------------------------------
|
|
||||||
$normalized = rawurldecode($normalized);
|
$normalized = rawurldecode($normalized);
|
||||||
|
|
||||||
// -------------------------------------------------
|
|
||||||
// 4) Nur lokale files/
|
|
||||||
// -------------------------------------------------
|
|
||||||
$normalized = ltrim($normalized, '/');
|
$normalized = ltrim($normalized, '/');
|
||||||
|
|
||||||
if (!str_starts_with($normalized, 'files/')) {
|
if (!str_starts_with($normalized, 'files/')) {
|
||||||
$this->log('Not in files/, skip', ['url' => $originalUrl]);
|
$this->log('Not in files/, skip', ['url' => $originalUrl]);
|
||||||
continue;
|
continue;
|
||||||
@@ -128,7 +116,7 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
$checksum = md5($normalized . '|' . $mtime);
|
$checksum = md5($normalized . '|' . $mtime);
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// 5) Skip unchanged
|
// Skip unchanged
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
if ($file['checksum'] === $checksum && !empty($file['text'])) {
|
if ($file['checksum'] === $checksum && !empty($file['text'])) {
|
||||||
continue;
|
continue;
|
||||||
@@ -140,7 +128,7 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// 6) MIME-Type
|
// MIME-Type
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
$ext = strtolower(pathinfo($normalized, PATHINFO_EXTENSION));
|
$ext = strtolower(pathinfo($normalized, PATHINFO_EXTENSION));
|
||||||
|
|
||||||
@@ -158,12 +146,12 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
// 7) Tika parse
|
// Tika BODY (roher Plaintext)
|
||||||
// -------------------------------------------------
|
// -------------------------------------------------
|
||||||
try {
|
try {
|
||||||
$this->log('Parsing file', ['url' => $normalized]);
|
$this->log('Parsing file', ['url' => $normalized]);
|
||||||
|
|
||||||
$response = $client->request(
|
$bodyResponse = $client->request(
|
||||||
'PUT',
|
'PUT',
|
||||||
$tikaUrl . '/tika/main',
|
$tikaUrl . '/tika/main',
|
||||||
[
|
[
|
||||||
@@ -175,14 +163,87 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
]
|
]
|
||||||
);
|
);
|
||||||
|
|
||||||
$text = trim((string) $response->getContent(false));
|
$text = trim((string) $bodyResponse->getContent(false));
|
||||||
|
|
||||||
|
} catch (\Throwable $e) {
|
||||||
|
$this->log('Body parse failed', [
|
||||||
|
'url' => $normalized,
|
||||||
|
'error' => $e->getMessage(),
|
||||||
|
]);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------
|
||||||
|
// TITLE: keep existing editor-defined title
|
||||||
|
// -------------------------------------------------
|
||||||
|
$title = $existingTitle !== '' ? $existingTitle : null;
|
||||||
|
|
||||||
|
// -------------------------------------------------
|
||||||
|
// Tika METADATA (Title) – only if no existing title
|
||||||
|
// -------------------------------------------------
|
||||||
|
if ($title === null) {
|
||||||
|
try {
|
||||||
|
$metaResponse = $client->request(
|
||||||
|
'PUT',
|
||||||
|
$tikaUrl . '/meta',
|
||||||
|
[
|
||||||
|
'headers' => [
|
||||||
|
'Accept' => 'application/json',
|
||||||
|
'Content-Type' => $mimeType,
|
||||||
|
],
|
||||||
|
'body' => fopen($absolutePath, 'rb'),
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
$meta = json_decode($metaResponse->getContent(false), true);
|
||||||
|
|
||||||
|
$rawTitle =
|
||||||
|
$meta['dc:title'][0]
|
||||||
|
?? $meta['pdf:docinfo:title'][0]
|
||||||
|
?? null;
|
||||||
|
|
||||||
|
if ($rawTitle) {
|
||||||
|
$title = html_entity_decode(
|
||||||
|
$rawTitle,
|
||||||
|
ENT_QUOTES | ENT_HTML5,
|
||||||
|
'UTF-8'
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (\Throwable) {
|
||||||
|
// Metadata optional
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------
|
||||||
|
// TITLE → ASCII SAFE (only if newly generated)
|
||||||
|
// -------------------------------------------------
|
||||||
|
if ($existingTitle === '' && $title) {
|
||||||
|
$title = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $title);
|
||||||
|
$title = preg_replace('/\s+/', ' ', $title);
|
||||||
|
$title = trim($title);
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------
|
||||||
|
// FALLBACK: Dateiname (only if still empty)
|
||||||
|
// -------------------------------------------------
|
||||||
|
if (!$title || strlen($title) < 5) {
|
||||||
|
$title = pathinfo($normalized, PATHINFO_FILENAME);
|
||||||
|
$title = str_replace(['_', '-'], ' ', $title);
|
||||||
|
$title = preg_replace('/\s+/', ' ', $title);
|
||||||
|
$title = trim($title);
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------
|
||||||
|
// Store result
|
||||||
|
// -------------------------------------------------
|
||||||
$db->prepare(
|
$db->prepare(
|
||||||
"UPDATE tl_search_files
|
"UPDATE tl_search_files
|
||||||
SET text = ?, checksum = ?, file_mtime = ?, tstamp = ?
|
SET text = ?, title = ?, checksum = ?, file_mtime = ?, tstamp = ?
|
||||||
WHERE id = ?"
|
WHERE id = ?"
|
||||||
)->execute(
|
)->execute(
|
||||||
$text,
|
$text,
|
||||||
|
$title,
|
||||||
$checksum,
|
$checksum,
|
||||||
$mtime,
|
$mtime,
|
||||||
time(),
|
time(),
|
||||||
@@ -192,14 +253,8 @@ class MeilisearchFilesParseCommand extends Command
|
|||||||
$this->log('File parsed', [
|
$this->log('File parsed', [
|
||||||
'url' => $normalized,
|
'url' => $normalized,
|
||||||
'chars' => mb_strlen($text),
|
'chars' => mb_strlen($text),
|
||||||
|
'title' => $title,
|
||||||
]);
|
]);
|
||||||
|
|
||||||
} catch (\Throwable $e) {
|
|
||||||
$this->log('Parse failed', [
|
|
||||||
'url' => $normalized,
|
|
||||||
'error' => $e->getMessage(),
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->log('Parser finished');
|
$this->log('Parser finished');
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ Contao 5 – Frontend Module Template
|
|||||||
#}
|
#}
|
||||||
|
|
||||||
<!-- indexer::stop -->
|
<!-- indexer::stop -->
|
||||||
|
{% block meilisearch %}
|
||||||
<div
|
<div
|
||||||
id="topsearch"
|
id="topsearch"
|
||||||
class="meilisearch-search"
|
class="meilisearch-search"
|
||||||
@@ -218,4 +219,5 @@ Contao 5 – Frontend Module Template
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
</script>
|
</script>
|
||||||
|
{% endblock %}
|
||||||
<!-- indexer::continue -->
|
<!-- indexer::continue -->
|
||||||
Reference in New Issue
Block a user