This commit is contained in:
Jürgen Mummert
2025-12-27 22:41:36 +01:00
parent c645e3caa8
commit e5cce34619
4 changed files with 239 additions and 185 deletions
+30 -38
View File
@@ -20,72 +20,64 @@ class MeilisearchImageHelper
*/
public function resolveImagePath(?string $uuid): ?string
{
    // Nothing to resolve without a UUID.
    if (!$uuid) {
        return null;
    }

    // Initialize the Contao framework before using models or Config.
    // A failure here is logged and reported to the caller as "no image".
    try {
        $this->framework->initialize();
    } catch (\Throwable $e) {
        error_log('[ContaoMeilisearch] ImageHelper: Framework init failed: ' . $e->getMessage());

        return null;
    }

    /** @var FilesModel|null $file */
    try {
        $file = FilesModel::findByUuid($uuid);
    } catch (\Throwable $e) {
        error_log(
            '[ContaoMeilisearch] ImageHelper: FilesModel lookup failed (' . $uuid . '): ' . $e->getMessage()
        );

        return null;
    }

    if (!$file) {
        error_log('[ContaoMeilisearch] ImageHelper: File not found for UUID ' . $uuid);

        return null;
    }

    // Image size ID as configured in tl_settings.
    $imageSizeId = (int) Config::get('meilisearch_imagesize');

    // Fallback: no (valid) image size configured, so return the original file path.
    if ($imageSizeId <= 0) {
        return $file->path;
    }

    try {
        $figure = $this->studio
            ->createFigureBuilder()
            ->from($file->path)
            ->setSize($imageSizeId)
            ->build();

        $image = $figure->getImage();

        if ($image === null) {
            error_log(
                '[ContaoMeilisearch] ImageHelper: Image generation failed for ' . $file->path
            );

            return null;
        }

        // An empty source string is normalized to null.
        return $image->getImageSrc() ?: null;
    } catch (\Throwable $e) {
        error_log(
            '[ContaoMeilisearch] ImageHelper: Image processing failed for '
            . $file->path . ': ' . $e->getMessage()
        );

        return null;
    }
}
+113 -54
View File
@@ -34,26 +34,48 @@ class MeilisearchIndexService
*/
public function run(): void
{
$this->framework->initialize();
try {
$this->framework->initialize();
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Framework initialization failed: ' . $e->getMessage());
return;
}
$host = (string) Config::get('meilisearch_host');
$apiKey = (string) Config::get('meilisearch_api_write');
$this->indexName = (string) Config::get('meilisearch_index');
if ($host === '' || $this->indexName === '') {
throw new \RuntimeException('Meilisearch is not configured in tl_settings.');
error_log('[ContaoMeilisearch] Meilisearch is not configured in tl_settings.');
return;
}
$this->client = new Client($host, $apiKey);
$index = $this->client->index($this->indexName);
try {
$this->client = new Client($host, $apiKey);
$index = $this->client->index($this->indexName);
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to connect to Meilisearch: ' . $e->getMessage());
return;
}
try {
$index->updateSettings(['primaryKey' => 'id']);
} catch (\Throwable) {}
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to set primaryKey: ' . $e->getMessage());
}
$this->ensureIndexSettings($index);
try {
$this->ensureIndexSettings($index);
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to update index settings: ' . $e->getMessage());
}
$index->deleteAllDocuments();
try {
$index->deleteAllDocuments();
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to delete documents: ' . $e->getMessage());
return;
}
$this->indexTlSearch($index);
$this->indexTlSearchPdf($index);
@@ -78,7 +100,6 @@ class MeilisearchIndexService
$text
);
// Text normalisieren
$text = preg_replace('/\s{2,}/u', ' ', $text);
$text = preg_replace('/\n{2,}/u', "\n", $text);
@@ -121,7 +142,13 @@ class MeilisearchIndexService
*/
private function indexTlSearch(Indexes $index): void
{
$rows = $this->connection->fetchAllAssociative('SELECT * FROM tl_search');
try {
$rows = $this->connection->fetchAllAssociative('SELECT * FROM tl_search');
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to read tl_search: ' . $e->getMessage());
return;
}
if (!$rows) {
return;
}
@@ -132,46 +159,58 @@ class MeilisearchIndexService
$documents = [];
foreach ($rows as $row) {
$type = $this->detectTypeFromMeta($row['meta'] ?? null);
try {
$type = $this->detectTypeFromMeta($row['meta'] ?? null);
$eventStart = null;
if ($type === 'event') {
$eventStart = $this->extractEventStartDate($row['meta'] ?? null);
if (!$indexPastEvents && $eventStart !== null && $eventStart < $today) {
continue;
$eventStart = null;
if ($type === 'event') {
$eventStart = $this->extractEventStartDate($row['meta'] ?? null);
if (!$indexPastEvents && $eventStart !== null && $eventStart < $today) {
continue;
}
}
}
$cleanText = $this->stripMeilisearchMeta((string) $row['text']);
$cleanText = $this->stripMeilisearchMeta((string) $row['text']);
$doc = [
'id' => $type . '_' . $row['id'],
'type' => $type,
'title' => $row['title'],
'text' => $cleanText,
'url' => $row['url'],
'protected' => (bool) $row['protected'],
'checksum' => $row['checksum'],
'keywords' => (string) ($row['keywords'] ?? ''),
'priority' => (int) ($row['priority'] ?? 0),
];
$doc = [
'id' => $type . '_' . $row['id'],
'type' => $type,
'title' => $row['title'],
'text' => $cleanText,
'url' => $row['url'],
'protected' => (bool) $row['protected'],
'checksum' => $row['checksum'],
'keywords' => (string) ($row['keywords'] ?? ''),
'priority' => (int) ($row['priority'] ?? 0),
];
if ($eventStart !== null) {
$doc['startDate'] = $eventStart;
}
if (!empty($row['imagepath'])) {
$imagePath = $this->imageHelper->resolveImagePath($row['imagepath']);
if ($imagePath !== null) {
$doc['poster'] = $imagePath;
if ($eventStart !== null) {
$doc['startDate'] = $eventStart;
}
}
$documents[] = $doc;
if (!empty($row['imagepath'])) {
$imagePath = $this->imageHelper->resolveImagePath($row['imagepath']);
if ($imagePath !== null) {
$doc['poster'] = $imagePath;
}
}
$documents[] = $doc;
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to build document for tl_search ID '
. ($row['id'] ?? '?') . ': ' . $e->getMessage()
);
}
}
if ($documents !== []) {
$index->addDocuments($documents);
try {
$index->addDocuments($documents);
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to add tl_search documents: ' . $e->getMessage());
}
}
}
@@ -180,7 +219,13 @@ class MeilisearchIndexService
*/
private function indexTlSearchPdf(Indexes $index): void
{
    // Read all rows; a database failure is logged and aborts this step only.
    try {
        $rows = $this->connection->fetchAllAssociative('SELECT * FROM tl_search_pdf');
    } catch (\Throwable $e) {
        error_log('[ContaoMeilisearch] Failed to read tl_search_pdf: ' . $e->getMessage());

        return;
    }

    if (!$rows) {
        return;
    }

    $documents = [];

    foreach ($rows as $row) {
        // Build each document in isolation so one bad row does not stop the rest.
        try {
            // Any type outside the supported list is indexed as "pdf".
            $fileType = in_array($row['type'], ['pdf', 'docx', 'xlsx', 'pptx'], true)
                ? $row['type']
                : 'pdf';

            $documents[] = [
                'id' => $fileType . '_' . $row['id'],
                'type' => $fileType,
                'title' => $row['title'],
                'text' => $this->stripMeilisearchMeta((string) $row['text']),
                'url' => $row['url'],
                'checksum' => $row['checksum'],
                'poster' => self::FILETYPE_ICON_MAP[$fileType]
                    ?? self::FILETYPE_ICON_MAP['pdf'],
            ];
        } catch (\Throwable $e) {
            error_log(
                '[ContaoMeilisearch] Failed to build PDF document for ID '
                . ($row['id'] ?? '?') . ': ' . $e->getMessage()
            );
        }
    }

    // Only call Meilisearch when at least one document was built.
    if ($documents !== []) {
        try {
            $index->addDocuments($documents);
        } catch (\Throwable $e) {
            error_log('[ContaoMeilisearch] Failed to add tl_search_pdf documents: ' . $e->getMessage());
        }
    }
}
private function detectTypeFromMeta(?string $meta): string
+45 -43
View File
@@ -34,19 +34,15 @@ class OfficeIndexService
}
try {
error_log('bearbeite Office-Datei: ' . $url);
// innerhalb des Crawls gleiche URL nicht mehrfach parsen
$seenKey = md5($url);
if (isset($this->seenThisCrawl[$seenKey])) {
error_log('→ übersprungen: bereits im Crawl verarbeitet');
continue;
}
$this->seenThisCrawl[$seenKey] = true;
$normalized = $this->normalizeOfficeUrl($url);
if ($normalized === null) {
error_log('→ übersprungen: kein gültiger Office-Pfad');
continue;
}
@@ -54,7 +50,6 @@ class OfficeIndexService
$absolutePath = $this->getAbsolutePath($relativePath);
if (!is_file($absolutePath)) {
error_log('→ übersprungen: Datei existiert nicht: ' . $absolutePath);
continue;
}
@@ -65,7 +60,6 @@ class OfficeIndexService
$text = $this->parseOfficeFile($absolutePath, $type);
if ($text === '') {
error_log('→ übersprungen: Office-Datei ohne Textinhalt');
continue;
}
@@ -78,10 +72,10 @@ class OfficeIndexService
$type
);
error_log('geschrieben in tl_search_pdf');
} catch (\Throwable $e) {
error_log('Office Service FEHLER: ' . $e->getMessage());
error_log(
'[ContaoMeilisearch] Office indexing failed for "' . $url . '": ' . $e->getMessage()
);
}
}
}
@@ -107,11 +101,7 @@ class OfficeIndexService
parse_str($parts['query'], $query);
if (!empty($query['p'])) {
$p = (string) $query['p'];
// Query-Parameter korrekt dekodieren
$p = urldecode($p);
$p = urldecode((string) $query['p']);
$ext = strtolower(pathinfo($p, PATHINFO_EXTENSION));
if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) {
@@ -136,29 +126,35 @@ class OfficeIndexService
int $mtime,
string $type
): void {
$db = Database::getInstance();
$db->prepare('
INSERT INTO tl_search_pdf
(tstamp, type, url, title, text, checksum, file_mtime)
VALUES
(?, ?, ?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE
tstamp=VALUES(tstamp),
type=VALUES(type),
url=VALUES(url),
title=VALUES(title),
text=VALUES(text),
file_mtime=VALUES(file_mtime)
')->execute(
time(),
$type,
$url,
$title,
$text,
$checksum,
$mtime
);
try {
Database::getInstance()
->prepare('
INSERT INTO tl_search_pdf
(tstamp, type, url, title, text, checksum, file_mtime)
VALUES
(?, ?, ?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE
tstamp=VALUES(tstamp),
type=VALUES(type),
url=VALUES(url),
title=VALUES(title),
text=VALUES(text),
file_mtime=VALUES(file_mtime)
')
->execute(
time(),
$type,
$url,
$title,
$text,
$checksum,
$mtime
);
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to write Office index entry (' . $url . '): ' . $e->getMessage()
);
}
}
private function parseOfficeFile(string $absolutePath, string $type): string
@@ -186,8 +182,10 @@ class OfficeIndexService
}
return $this->cleanText($text);
} catch (\Throwable) {
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to parse DOCX "' . $absolutePath . '": ' . $e->getMessage()
);
return '';
}
}
@@ -205,8 +203,10 @@ class OfficeIndexService
}
return $this->cleanText($text);
} catch (\Throwable) {
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to parse XLSX "' . $absolutePath . '": ' . $e->getMessage()
);
return '';
}
}
@@ -226,8 +226,10 @@ class OfficeIndexService
}
return $this->cleanText($text);
} catch (\Throwable) {
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to parse PPTX "' . $absolutePath . '": ' . $e->getMessage()
);
return '';
}
}
+51 -50
View File
@@ -23,7 +23,6 @@ class PdfIndexService
/**
* Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen.
* MUSS IMMER laufen (auch wenn Checkbox später aus ist).
*/
public function resetTableOnce(): void
{
@@ -34,10 +33,11 @@ class PdfIndexService
$this->didReset = true;
$this->seenThisCrawl = [];
// bei <=100 PDFs: sauber & simpel
Database::getInstance()->execute('TRUNCATE tl_search_pdf');
error_log('PDF Reset: tl_search_pdf geleert (TRUNCATE)');
try {
Database::getInstance()->execute('TRUNCATE tl_search_pdf');
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage());
}
}
/**
@@ -54,25 +54,20 @@ class PdfIndexService
}
try {
error_log('bearbeite PDF: ' . $url);
// innerhalb des Crawls gleiche URL nicht 20x parsen (News-Teaser etc.)
// innerhalb des Crawls gleiche URL nicht mehrfach parsen
$seenKey = md5($url);
if (isset($this->seenThisCrawl[$seenKey])) {
error_log('→ übersprungen: bereits im Crawl verarbeitet');
continue;
}
$this->seenThisCrawl[$seenKey] = true;
$normalizedPath = $this->normalizePdfUrl($url);
if ($normalizedPath === null) {
error_log('→ übersprungen: kein gültiger PDF-Pfad');
continue;
}
$absolutePath = $this->getAbsolutePath($normalizedPath);
if (!is_file($absolutePath)) {
error_log('→ übersprungen: Datei existiert nicht: ' . $absolutePath);
continue;
}
@@ -88,7 +83,6 @@ class PdfIndexService
$text = $this->parsePdf($absolutePath);
if ($text === '') {
error_log('→ übersprungen: PDF ohne Textinhalt');
continue;
}
@@ -100,10 +94,10 @@ class PdfIndexService
$mtime
);
error_log('geschrieben in tl_search_pdf');
} catch (\Throwable $e) {
error_log('PDF Service FEHLER: ' . $e->getMessage());
error_log(
'[ContaoMeilisearch] PDF indexing failed for "' . $url . '": ' . $e->getMessage()
);
}
}
}
@@ -118,8 +112,12 @@ class PdfIndexService
$decoded = html_entity_decode($url);
$parts = parse_url($decoded);
// Fall 2: absolute URL auf gleiche Site -> Pfad extrahieren
if (!empty($parts['path']) && str_starts_with($parts['path'], '/files/') && str_ends_with(strtolower($parts['path']), '.pdf')) {
// Fall 2: absolute URL auf gleiche Site
if (
!empty($parts['path'])
&& str_starts_with($parts['path'], '/files/')
&& str_ends_with(strtolower($parts['path']), '.pdf')
) {
return $parts['path'];
}
@@ -131,13 +129,7 @@ class PdfIndexService
parse_str($parts['query'], $query);
if (!empty($query['p'])) {
$p = (string) $query['p'];
// Query-Parameter korrekt dekodieren
$p = urldecode($p);
// deine Links enthalten oft "pdf/DATEI.pdf"
// => wird zu "/files/pdf/DATEI.pdf"
$p = urldecode((string) $query['p']);
return '/files/' . ltrim($p, '/');
}
@@ -151,28 +143,33 @@ class PdfIndexService
/**
 * Writes one PDF entry to tl_search_pdf.
 *
 * Uses INSERT ... ON DUPLICATE KEY UPDATE, so a row that collides with an
 * existing key is updated instead of inserted.
 * NOTE(review): presumably the colliding key is a UNIQUE index on `checksum`
 * (checksum is the only inserted column not listed in the UPDATE clause) —
 * confirm against the table definition.
 *
 * Database errors are logged via error_log() and not rethrown.
 *
 * @param string $url      URL stored for the entry
 * @param string $title    Title stored for the entry
 * @param string $text     Text content stored for the entry
 * @param string $checksum Checksum stored for the entry
 * @param int    $mtime    Value written to the file_mtime column
 */
private function upsertPdf(string $url, string $title, string $text, string $checksum, int $mtime): void
{
    try {
        Database::getInstance()
            ->prepare('
                INSERT INTO tl_search_pdf
                (tstamp, url, title, text, checksum, file_mtime)
                VALUES
                (?, ?, ?, ?, ?, ?)
                ON DUPLICATE KEY UPDATE
                tstamp=VALUES(tstamp),
                url=VALUES(url),
                title=VALUES(title),
                text=VALUES(text),
                file_mtime=VALUES(file_mtime)
            ')
            ->execute(
                time(),
                $url,
                $title,
                $text,
                $checksum,
                $mtime
            );
    } catch (\Throwable $e) {
        error_log(
            '[ContaoMeilisearch] Failed to write PDF index entry (' . $url . '): ' . $e->getMessage()
        );
    }
}
private function parsePdf(string $absolutePath): string
@@ -184,8 +181,10 @@ class PdfIndexService
$text = $this->cleanPdfContent($pdf->getText());
return mb_substr($text, 0, 20000);
} catch (\Throwable) {
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to parse PDF "' . $absolutePath . '": ' . $e->getMessage()
);
return '';
}
}
@@ -221,8 +220,10 @@ class PdfIndexService
}
}
}
} catch (\Throwable) {
// ignore
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to read PDF metadata "' . $absolutePath . '": ' . $e->getMessage()
);
}
return null;