This commit is contained in:
Jürgen Mummert
2025-12-27 22:41:36 +01:00
parent c645e3caa8
commit e5cce34619
4 changed files with 239 additions and 185 deletions
+29 -37
View File
@@ -20,72 +20,64 @@ class MeilisearchImageHelper
*/ */
public function resolveImagePath(?string $uuid): ?string public function resolveImagePath(?string $uuid): ?string
{ {
error_log('--- MeiliImg START ---');
if (!$uuid) { if (!$uuid) {
error_log('[MeiliImg] UUID leer → return null');
return null; return null;
} }
error_log('[MeiliImg] UUID = ' . $uuid);
// Contao-Framework initialisieren (CLI & Frontend) // Contao-Framework initialisieren (CLI & Frontend)
try {
$this->framework->initialize(); $this->framework->initialize();
error_log('[MeiliImg] Framework initialized'); } catch (\Throwable $e) {
error_log('[ContaoMeilisearch] ImageHelper: Framework init failed: ' . $e->getMessage());
/** @var FilesModel|null $file */
$file = FilesModel::findByUuid($uuid);
if (!$file) {
error_log('[MeiliImg] FilesModel::findByUuid() = NULL');
return null; return null;
} }
error_log('[MeiliImg] FilesModel gefunden'); /** @var FilesModel|null $file */
error_log('[MeiliImg] file->path = ' . $file->path); try {
error_log('[MeiliImg] file->uuid = ' . ($file->uuid ?? '(n/a)')); $file = FilesModel::findByUuid($uuid);
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] ImageHelper: FilesModel lookup failed (' . $uuid . '): ' . $e->getMessage()
);
return null;
}
if (!$file) {
error_log('[ContaoMeilisearch] ImageHelper: File not found for UUID ' . $uuid);
return null;
}
// ImageSize aus tl_settings // ImageSize aus tl_settings
$rawSize = Config::get('meilisearch_imagesize'); $imageSizeId = (int) Config::get('meilisearch_imagesize');
$imageSizeId = (int) $rawSize;
error_log('[MeiliImg] meilisearch_imagesize raw = ' . var_export($rawSize, true));
error_log('[MeiliImg] meilisearch_imagesize int = ' . $imageSizeId);
// Fallback: Originaldatei // Fallback: Originaldatei
if ($imageSizeId <= 0) { if ($imageSizeId <= 0) {
error_log('[MeiliImg] imageSizeId <= 0 → FALLBACK file->path = ' . $file->path);
error_log('--- MeiliImg END ---');
return $file->path; return $file->path;
} }
try { try {
$builder = $this->studio $figure = $this->studio
->createFigureBuilder() ->createFigureBuilder()
->from($file->path) ->from($file->path)
->setSize($imageSizeId); ->setSize($imageSizeId)
->build();
error_log('[MeiliImg] FigureBuilder erstellt (from=' . $file->path . ', size=' . $imageSizeId . ')');
$figure = $builder->build();
error_log('[MeiliImg] Figure build() OK');
$image = $figure->getImage(); $image = $figure->getImage();
if ($image === null) { if ($image === null) {
error_log('[MeiliImg] figure->getImage() = NULL'); error_log(
'[ContaoMeilisearch] ImageHelper: Image generation failed for ' . $file->path
);
return null; return null;
} }
$src = $image->getImageSrc(); return $image->getImageSrc() ?: null;
error_log('[MeiliImg] image->getImageSrc() = ' . $src);
return $src ?: null;
} catch (\Throwable $e) { } catch (\Throwable $e) {
error_log('[MeiliImg] EXCEPTION ' . get_class($e) . ': ' . $e->getMessage()); error_log(
error_log('--- MeiliImg END ---'); '[ContaoMeilisearch] ImageHelper: Image processing failed for '
. $file->path . ': ' . $e->getMessage()
);
return null; return null;
} }
} }
+62 -3
View File
@@ -34,26 +34,48 @@ class MeilisearchIndexService
*/ */
public function run(): void public function run(): void
{ {
try {
$this->framework->initialize(); $this->framework->initialize();
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Framework initialization failed: ' . $e->getMessage());
return;
}
$host = (string) Config::get('meilisearch_host'); $host = (string) Config::get('meilisearch_host');
$apiKey = (string) Config::get('meilisearch_api_write'); $apiKey = (string) Config::get('meilisearch_api_write');
$this->indexName = (string) Config::get('meilisearch_index'); $this->indexName = (string) Config::get('meilisearch_index');
if ($host === '' || $this->indexName === '') { if ($host === '' || $this->indexName === '') {
throw new \RuntimeException('Meilisearch is not configured in tl_settings.'); error_log('[ContaoMeilisearch] Meilisearch is not configured in tl_settings.');
return;
} }
try {
$this->client = new Client($host, $apiKey); $this->client = new Client($host, $apiKey);
$index = $this->client->index($this->indexName); $index = $this->client->index($this->indexName);
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to connect to Meilisearch: ' . $e->getMessage());
return;
}
try { try {
$index->updateSettings(['primaryKey' => 'id']); $index->updateSettings(['primaryKey' => 'id']);
} catch (\Throwable) {} } catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to set primaryKey: ' . $e->getMessage());
}
try {
$this->ensureIndexSettings($index); $this->ensureIndexSettings($index);
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to update index settings: ' . $e->getMessage());
}
try {
$index->deleteAllDocuments(); $index->deleteAllDocuments();
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to delete documents: ' . $e->getMessage());
return;
}
$this->indexTlSearch($index); $this->indexTlSearch($index);
$this->indexTlSearchPdf($index); $this->indexTlSearchPdf($index);
@@ -78,7 +100,6 @@ class MeilisearchIndexService
$text $text
); );
// Text normalisieren
$text = preg_replace('/\s{2,}/u', ' ', $text); $text = preg_replace('/\s{2,}/u', ' ', $text);
$text = preg_replace('/\n{2,}/u', "\n", $text); $text = preg_replace('/\n{2,}/u', "\n", $text);
@@ -121,7 +142,13 @@ class MeilisearchIndexService
*/ */
private function indexTlSearch(Indexes $index): void private function indexTlSearch(Indexes $index): void
{ {
try {
$rows = $this->connection->fetchAllAssociative('SELECT * FROM tl_search'); $rows = $this->connection->fetchAllAssociative('SELECT * FROM tl_search');
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to read tl_search: ' . $e->getMessage());
return;
}
if (!$rows) { if (!$rows) {
return; return;
} }
@@ -132,6 +159,7 @@ class MeilisearchIndexService
$documents = []; $documents = [];
foreach ($rows as $row) { foreach ($rows as $row) {
try {
$type = $this->detectTypeFromMeta($row['meta'] ?? null); $type = $this->detectTypeFromMeta($row['meta'] ?? null);
$eventStart = null; $eventStart = null;
@@ -168,10 +196,21 @@ class MeilisearchIndexService
} }
$documents[] = $doc; $documents[] = $doc;
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to build document for tl_search ID '
. ($row['id'] ?? '?') . ': ' . $e->getMessage()
);
}
} }
if ($documents !== []) { if ($documents !== []) {
try {
$index->addDocuments($documents); $index->addDocuments($documents);
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to add tl_search documents: ' . $e->getMessage());
}
} }
} }
@@ -180,7 +219,13 @@ class MeilisearchIndexService
*/ */
private function indexTlSearchPdf(Indexes $index): void private function indexTlSearchPdf(Indexes $index): void
{ {
try {
$rows = $this->connection->fetchAllAssociative('SELECT * FROM tl_search_pdf'); $rows = $this->connection->fetchAllAssociative('SELECT * FROM tl_search_pdf');
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to read tl_search_pdf: ' . $e->getMessage());
return;
}
if (!$rows) { if (!$rows) {
return; return;
} }
@@ -188,6 +233,7 @@ class MeilisearchIndexService
$documents = []; $documents = [];
foreach ($rows as $row) { foreach ($rows as $row) {
try {
$fileType = in_array($row['type'], ['pdf', 'docx', 'xlsx', 'pptx'], true) $fileType = in_array($row['type'], ['pdf', 'docx', 'xlsx', 'pptx'], true)
? $row['type'] ? $row['type']
: 'pdf'; : 'pdf';
@@ -202,9 +248,22 @@ class MeilisearchIndexService
'poster' => self::FILETYPE_ICON_MAP[$fileType] 'poster' => self::FILETYPE_ICON_MAP[$fileType]
?? self::FILETYPE_ICON_MAP['pdf'], ?? self::FILETYPE_ICON_MAP['pdf'],
]; ];
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to build PDF document for ID '
. ($row['id'] ?? '?') . ': ' . $e->getMessage()
);
}
} }
if ($documents !== []) {
try {
$index->addDocuments($documents); $index->addDocuments($documents);
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to add tl_search_pdf documents: ' . $e->getMessage());
}
}
} }
private function detectTypeFromMeta(?string $meta): string private function detectTypeFromMeta(?string $meta): string
+26 -24
View File
@@ -34,19 +34,15 @@ class OfficeIndexService
} }
try { try {
error_log('bearbeite Office-Datei: ' . $url);
// innerhalb des Crawls gleiche URL nicht mehrfach parsen // innerhalb des Crawls gleiche URL nicht mehrfach parsen
$seenKey = md5($url); $seenKey = md5($url);
if (isset($this->seenThisCrawl[$seenKey])) { if (isset($this->seenThisCrawl[$seenKey])) {
error_log('→ übersprungen: bereits im Crawl verarbeitet');
continue; continue;
} }
$this->seenThisCrawl[$seenKey] = true; $this->seenThisCrawl[$seenKey] = true;
$normalized = $this->normalizeOfficeUrl($url); $normalized = $this->normalizeOfficeUrl($url);
if ($normalized === null) { if ($normalized === null) {
error_log('→ übersprungen: kein gültiger Office-Pfad');
continue; continue;
} }
@@ -54,7 +50,6 @@ class OfficeIndexService
$absolutePath = $this->getAbsolutePath($relativePath); $absolutePath = $this->getAbsolutePath($relativePath);
if (!is_file($absolutePath)) { if (!is_file($absolutePath)) {
error_log('→ übersprungen: Datei existiert nicht: ' . $absolutePath);
continue; continue;
} }
@@ -65,7 +60,6 @@ class OfficeIndexService
$text = $this->parseOfficeFile($absolutePath, $type); $text = $this->parseOfficeFile($absolutePath, $type);
if ($text === '') { if ($text === '') {
error_log('→ übersprungen: Office-Datei ohne Textinhalt');
continue; continue;
} }
@@ -78,10 +72,10 @@ class OfficeIndexService
$type $type
); );
error_log('geschrieben in tl_search_pdf');
} catch (\Throwable $e) { } catch (\Throwable $e) {
error_log('Office Service FEHLER: ' . $e->getMessage()); error_log(
'[ContaoMeilisearch] Office indexing failed for "' . $url . '": ' . $e->getMessage()
);
} }
} }
} }
@@ -107,11 +101,7 @@ class OfficeIndexService
parse_str($parts['query'], $query); parse_str($parts['query'], $query);
if (!empty($query['p'])) { if (!empty($query['p'])) {
$p = (string) $query['p']; $p = urldecode((string) $query['p']);
// Query-Parameter korrekt dekodieren
$p = urldecode($p);
$ext = strtolower(pathinfo($p, PATHINFO_EXTENSION)); $ext = strtolower(pathinfo($p, PATHINFO_EXTENSION));
if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) { if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) {
@@ -136,9 +126,9 @@ class OfficeIndexService
int $mtime, int $mtime,
string $type string $type
): void { ): void {
$db = Database::getInstance(); try {
Database::getInstance()
$db->prepare(' ->prepare('
INSERT INTO tl_search_pdf INSERT INTO tl_search_pdf
(tstamp, type, url, title, text, checksum, file_mtime) (tstamp, type, url, title, text, checksum, file_mtime)
VALUES VALUES
@@ -150,7 +140,8 @@ class OfficeIndexService
title=VALUES(title), title=VALUES(title),
text=VALUES(text), text=VALUES(text),
file_mtime=VALUES(file_mtime) file_mtime=VALUES(file_mtime)
')->execute( ')
->execute(
time(), time(),
$type, $type,
$url, $url,
@@ -159,6 +150,11 @@ class OfficeIndexService
$checksum, $checksum,
$mtime $mtime
); );
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to write Office index entry (' . $url . '): ' . $e->getMessage()
);
}
} }
private function parseOfficeFile(string $absolutePath, string $type): string private function parseOfficeFile(string $absolutePath, string $type): string
@@ -186,8 +182,10 @@ class OfficeIndexService
} }
return $this->cleanText($text); return $this->cleanText($text);
} catch (\Throwable $e) {
} catch (\Throwable) { error_log(
'[ContaoMeilisearch] Failed to parse DOCX "' . $absolutePath . '": ' . $e->getMessage()
);
return ''; return '';
} }
} }
@@ -205,8 +203,10 @@ class OfficeIndexService
} }
return $this->cleanText($text); return $this->cleanText($text);
} catch (\Throwable $e) {
} catch (\Throwable) { error_log(
'[ContaoMeilisearch] Failed to parse XLSX "' . $absolutePath . '": ' . $e->getMessage()
);
return ''; return '';
} }
} }
@@ -226,8 +226,10 @@ class OfficeIndexService
} }
return $this->cleanText($text); return $this->cleanText($text);
} catch (\Throwable $e) {
} catch (\Throwable) { error_log(
'[ContaoMeilisearch] Failed to parse PPTX "' . $absolutePath . '": ' . $e->getMessage()
);
return ''; return '';
} }
} }
+33 -32
View File
@@ -23,7 +23,6 @@ class PdfIndexService
/** /**
* Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen. * Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen.
* MUSS IMMER laufen (auch wenn Checkbox später aus ist).
*/ */
public function resetTableOnce(): void public function resetTableOnce(): void
{ {
@@ -34,10 +33,11 @@ class PdfIndexService
$this->didReset = true; $this->didReset = true;
$this->seenThisCrawl = []; $this->seenThisCrawl = [];
// bei <=100 PDFs: sauber & simpel try {
Database::getInstance()->execute('TRUNCATE tl_search_pdf'); Database::getInstance()->execute('TRUNCATE tl_search_pdf');
} catch (\Throwable $e) {
error_log('PDF Reset: tl_search_pdf geleert (TRUNCATE)'); error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage());
}
} }
/** /**
@@ -54,25 +54,20 @@ class PdfIndexService
} }
try { try {
error_log('bearbeite PDF: ' . $url); // innerhalb des Crawls gleiche URL nicht mehrfach parsen
// innerhalb des Crawls gleiche URL nicht 20x parsen (News-Teaser etc.)
$seenKey = md5($url); $seenKey = md5($url);
if (isset($this->seenThisCrawl[$seenKey])) { if (isset($this->seenThisCrawl[$seenKey])) {
error_log('→ übersprungen: bereits im Crawl verarbeitet');
continue; continue;
} }
$this->seenThisCrawl[$seenKey] = true; $this->seenThisCrawl[$seenKey] = true;
$normalizedPath = $this->normalizePdfUrl($url); $normalizedPath = $this->normalizePdfUrl($url);
if ($normalizedPath === null) { if ($normalizedPath === null) {
error_log('→ übersprungen: kein gültiger PDF-Pfad');
continue; continue;
} }
$absolutePath = $this->getAbsolutePath($normalizedPath); $absolutePath = $this->getAbsolutePath($normalizedPath);
if (!is_file($absolutePath)) { if (!is_file($absolutePath)) {
error_log('→ übersprungen: Datei existiert nicht: ' . $absolutePath);
continue; continue;
} }
@@ -88,7 +83,6 @@ class PdfIndexService
$text = $this->parsePdf($absolutePath); $text = $this->parsePdf($absolutePath);
if ($text === '') { if ($text === '') {
error_log('→ übersprungen: PDF ohne Textinhalt');
continue; continue;
} }
@@ -100,10 +94,10 @@ class PdfIndexService
$mtime $mtime
); );
error_log('geschrieben in tl_search_pdf');
} catch (\Throwable $e) { } catch (\Throwable $e) {
error_log('PDF Service FEHLER: ' . $e->getMessage()); error_log(
'[ContaoMeilisearch] PDF indexing failed for "' . $url . '": ' . $e->getMessage()
);
} }
} }
} }
@@ -118,8 +112,12 @@ class PdfIndexService
$decoded = html_entity_decode($url); $decoded = html_entity_decode($url);
$parts = parse_url($decoded); $parts = parse_url($decoded);
// Fall 2: absolute URL auf gleiche Site -> Pfad extrahieren // Fall 2: absolute URL auf gleiche Site
if (!empty($parts['path']) && str_starts_with($parts['path'], '/files/') && str_ends_with(strtolower($parts['path']), '.pdf')) { if (
!empty($parts['path'])
&& str_starts_with($parts['path'], '/files/')
&& str_ends_with(strtolower($parts['path']), '.pdf')
) {
return $parts['path']; return $parts['path'];
} }
@@ -131,13 +129,7 @@ class PdfIndexService
parse_str($parts['query'], $query); parse_str($parts['query'], $query);
if (!empty($query['p'])) { if (!empty($query['p'])) {
$p = (string) $query['p']; $p = urldecode((string) $query['p']);
// Query-Parameter korrekt dekodieren
$p = urldecode($p);
// deine Links enthalten oft "pdf/DATEI.pdf"
// => wird zu "/files/pdf/DATEI.pdf"
return '/files/' . ltrim($p, '/'); return '/files/' . ltrim($p, '/');
} }
@@ -151,10 +143,9 @@ class PdfIndexService
private function upsertPdf(string $url, string $title, string $text, string $checksum, int $mtime): void private function upsertPdf(string $url, string $title, string $text, string $checksum, int $mtime): void
{ {
$db = Database::getInstance(); try {
Database::getInstance()
// wichtig: UNIQUE(checksum) -> entweder INSERT oder UPDATE ->prepare('
$db->prepare('
INSERT INTO tl_search_pdf INSERT INTO tl_search_pdf
(tstamp, url, title, text, checksum, file_mtime) (tstamp, url, title, text, checksum, file_mtime)
VALUES VALUES
@@ -165,7 +156,8 @@ class PdfIndexService
title=VALUES(title), title=VALUES(title),
text=VALUES(text), text=VALUES(text),
file_mtime=VALUES(file_mtime) file_mtime=VALUES(file_mtime)
')->execute( ')
->execute(
time(), time(),
$url, $url,
$title, $title,
@@ -173,6 +165,11 @@ class PdfIndexService
$checksum, $checksum,
$mtime $mtime
); );
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to write PDF index entry (' . $url . '): ' . $e->getMessage()
);
}
} }
private function parsePdf(string $absolutePath): string private function parsePdf(string $absolutePath): string
@@ -184,8 +181,10 @@ class PdfIndexService
$text = $this->cleanPdfContent($pdf->getText()); $text = $this->cleanPdfContent($pdf->getText());
return mb_substr($text, 0, 20000); return mb_substr($text, 0, 20000);
} catch (\Throwable $e) {
} catch (\Throwable) { error_log(
'[ContaoMeilisearch] Failed to parse PDF "' . $absolutePath . '": ' . $e->getMessage()
);
return ''; return '';
} }
} }
@@ -221,8 +220,10 @@ class PdfIndexService
} }
} }
} }
} catch (\Throwable) { } catch (\Throwable $e) {
// ignore error_log(
'[ContaoMeilisearch] Failed to read PDF metadata "' . $absolutePath . '": ' . $e->getMessage()
);
} }
return null; return null;