diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php
index 4fb0dc5..86733c9 100644
--- a/src/EventListener/IndexPageListener.php
+++ b/src/EventListener/IndexPageListener.php
@@ -15,18 +15,92 @@ class IndexPageListener
public function onIndexPage(string $content, array &$data, array &$set): void
{
- fwrite(STDERR, "\n[Meili DEBUG] onIndexPage() called\n");
-
/*
* =====================
* PDF: Reset genau 1× pro Crawl
* =====================
*/
try {
- fwrite(STDERR, "[Meili DEBUG] resetTableOnce()\n");
$this->pdfIndexService->resetTableOnce();
} catch (\Throwable $e) {
- fwrite(STDERR, "[Meili DEBUG] PDF reset failed: {$e->getMessage()}\n");
+ error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage());
+ }
+
+ /*
+ * =====================
+ * SEITEN-METADATEN
+ * =====================
+ */
+ if (str_contains($content, 'MEILISEARCH_JSON')) {
+ try {
+ $parsed = $this->extractMeilisearchJson($content);
+ } catch (\Throwable $e) {
+ error_log('[ContaoMeilisearch] Failed to extract MEILISEARCH_JSON: ' . $e->getMessage());
+ $parsed = null;
+ }
+
+ if (is_array($parsed)) {
+
+ // PRIORITY
+ $priority =
+ $parsed['event']['priority']
+ ?? $parsed['news']['priority']
+ ?? $parsed['page']['priority']
+ ?? null;
+
+ if ($priority !== null && $priority !== '') {
+ $set['priority'] = (int) $priority;
+ }
+
+ // KEYWORDS
+ $keywordSources = [
+ $parsed['event']['keywords'] ?? null,
+ $parsed['news']['keywords'] ?? null,
+ $parsed['page']['keywords'] ?? null,
+ ];
+
+ $keywords = [];
+ foreach ($keywordSources as $src) {
+ if (!is_string($src) || trim($src) === '') {
+ continue;
+ }
+ foreach (preg_split('/\s+/', trim($src)) as $word) {
+ $keywords[] = $word;
+ }
+ }
+
+ if ($keywords) {
+ $set['keywords'] = implode(' ', array_unique($keywords));
+ }
+
+ // IMAGEPATH
+ if (!empty($parsed['page']['searchimage'])) {
+ $set['imagepath'] = trim((string) $parsed['page']['searchimage']);
+ }
+
+ // STARTDATE
+ $startDate =
+ $parsed['event']['startDate']
+ ?? $parsed['news']['startDate']
+ ?? null;
+
+ if (is_numeric($startDate) && (int) $startDate > 0) {
+ $set['startDate'] = (int) $startDate;
+ }
+
+ // CHECKSUM
+ try {
+ $checksumSeed = (string) ($data['checksum'] ?? '');
+ $checksumSeed .= '|' . ($set['keywords'] ?? '');
+ $checksumSeed .= '|' . ($set['priority'] ?? '');
+ $checksumSeed .= '|' . ($set['imagepath'] ?? '');
+ $checksumSeed .= '|' . ($set['startDate'] ?? '');
+
+ $set['checksum'] = md5($checksumSeed);
+ } catch (\Throwable $e) {
+ error_log('[ContaoMeilisearch] Failed to generate checksum: ' . $e->getMessage());
+ }
+ }
}
/*
@@ -35,91 +109,67 @@ class IndexPageListener
* =====================
*/
if ((int) ($data['protected'] ?? 0) !== 0) {
- fwrite(STDERR, "[Meili DEBUG] Page is protected → skip files\n");
return;
}
$indexPdfs = (bool) Config::get('meilisearch_index_pdfs');
$indexOffice = (bool) Config::get('meilisearch_index_office');
- fwrite(
- STDERR,
- "[Meili DEBUG] Settings: pdfs="
- . ($indexPdfs ? '1' : '0')
- . " office="
- . ($indexOffice ? '1' : '0')
- . "\n"
- );
-
if (!$indexPdfs && !$indexOffice) {
- fwrite(STDERR, "[Meili DEBUG] No file indexing enabled → return\n");
return;
}
$links = $this->findAllLinks($content);
- fwrite(STDERR, "[Meili DEBUG] Found " . count($links) . " links\n");
$pdfLinks = [];
$officeLinks = [];
foreach ($links as $link) {
- fwrite(STDERR, "[Meili DEBUG] URL: {$link['url']}\n");
-
$type = $this->detectIndexableFileType($link['url']);
- fwrite(
- STDERR,
- "[Meili DEBUG] → detected type: "
- . ($type ?? 'none')
- . "\n"
- );
- if ($type === 'pdf') {
- if ($indexPdfs) {
- fwrite(STDERR, "[Meili DEBUG] → add to PDF queue\n");
- $pdfLinks[] = $link;
- } else {
- fwrite(STDERR, "[Meili DEBUG] → PDF indexing disabled\n");
- }
+ if ($type === 'pdf' && $indexPdfs) {
+ $pdfLinks[] = $link;
continue;
}
- if (in_array($type, ['docx', 'xlsx', 'pptx'], true)) {
- if ($indexOffice) {
- fwrite(STDERR, "[Meili DEBUG] → add to OFFICE queue\n");
- $officeLinks[] = $link;
- } else {
- fwrite(STDERR, "[Meili DEBUG] → Office indexing disabled\n");
- }
- continue;
+ if (
+ in_array($type, ['docx', 'xlsx', 'pptx'], true)
+ && $indexOffice
+ ) {
+ $officeLinks[] = $link;
}
-
- fwrite(STDERR, "[Meili DEBUG] → ignored\n");
}
- fwrite(
- STDERR,
- "[Meili DEBUG] Final queues: pdf="
- . count($pdfLinks)
- . " office="
- . count($officeLinks)
- . "\n"
- );
-
try {
if ($pdfLinks !== []) {
- fwrite(STDERR, "[Meili DEBUG] Calling handlePdfLinks()\n");
$this->pdfIndexService->handlePdfLinks($pdfLinks);
}
if ($officeLinks !== []) {
- fwrite(STDERR, "[Meili DEBUG] Calling handleOfficeLinks()\n");
$this->officeIndexService->handleOfficeLinks($officeLinks);
}
} catch (\Throwable $e) {
- fwrite(STDERR, "[Meili DEBUG] File indexing failed: {$e->getMessage()}\n");
+ error_log('[ContaoMeilisearch] File indexing failed: ' . $e->getMessage());
}
}
+    /**
+     * Extracts the JSON payload that is embedded in the page markup inside an
+     * HTML comment tagged with MEILISEARCH_JSON.
+     *
+     * NOTE(review): the comment layout (`<!-- MEILISEARCH_JSON {...} -->`) is
+     * reconstructed from this method's purpose and the 'MEILISEARCH_JSON'
+     * guard in onIndexPage(); confirm it against the template that emits it.
+     *
+     * @param string $content Page content passed to the index hook.
+     *
+     * @return array|null Decoded payload, or null when the marker comment is
+     *                    missing or its body does not decode to an array.
+     */
+    private function extractMeilisearchJson(string $content): ?array
+    {
+        // Capture the (non-greedy) comment body; the /s flag lets it span several lines.
+        if (preg_match('/<!--\s*MEILISEARCH_JSON\s*(.*?)\s*-->/s', $content, $m) !== 1) {
+            return null;
+        }
+
+        // Drop a leading UTF-8 BOM before decoding; preg_replace() may return null on error.
+        $json = preg_replace('/^\xEF\xBB\xBF/', '', trim($m[1])) ?? '';
+        $data = json_decode($json, true);
+
+        // json_decode() returns null for malformed input, so the array check covers errors too.
+        return is_array($data) ? $data : null;
+    }
+
/**
* Sammle alle Links
*/
@@ -150,39 +200,35 @@ class IndexPageListener
*/
private function detectIndexableFileType(string $url): ?string
{
- fwrite(STDERR, "[Meili DEBUG] detectIndexableFileType(): $url\n");
-
+ // Hash entfernen
$url = strtok($url, '#');
- $parts = parse_url($url);
+ $parts = parse_url($url);
if (!$parts) {
- fwrite(STDERR, "[Meili DEBUG] → parse_url failed\n");
return null;
}
+ // direkter Pfad (/files/…)
if (!empty($parts['path'])) {
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
- fwrite(STDERR, "[Meili DEBUG] → path ext: $ext\n");
-
if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
return $ext;
}
}
+ // Query-Parameter (Contao 4 + 5)
if (!empty($parts['query'])) {
parse_str($parts['query'], $query);
foreach (['file', 'p', 'f'] as $param) {
if (!empty($query[$param])) {
- $candidate = rawurldecode(
- html_entity_decode((string) $query[$param], ENT_QUOTES)
- );
+ $candidate = (string) $query[$param];
+
+ // sicher decodieren (Contao 4 + 5)
+ $candidate = html_entity_decode($candidate, ENT_QUOTES);
+ $candidate = rawurldecode($candidate);
$ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION));
- fwrite(
- STDERR,
- "[Meili DEBUG] → query $param=$candidate ext=$ext\n"
- );
if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
return $ext;
diff --git a/src/Service/PdfIndexService.php b/src/Service/PdfIndexService.php
index f689c91..57fa73e 100644
--- a/src/Service/PdfIndexService.php
+++ b/src/Service/PdfIndexService.php
@@ -10,119 +10,148 @@ class PdfIndexService
{
private string $projectDir;
- // pro PHP-Process genau 1x resetten
private bool $didReset = false;
-
- // pro Crawl-Durchlauf: doppelte Verarbeitung vermeiden
private array $seenThisCrawl = [];
+    /**
+     * Stores the project root (without trailing slash) used to resolve file paths.
+     *
+     * @param ParameterBagInterface $params Container parameters; 'kernel.project_dir' is read.
+     */
     public function __construct(ParameterBagInterface $params)
     {
         $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
+        // error_log() instead of fwrite(STDERR, ...): the STDERR constant only exists
+        // in the CLI SAPI, so using it here would make the service unconstructible
+        // in web requests.
+        error_log('[Meili PDF DEBUG] projectDir=' . $this->projectDir);
     }
- /**
- * Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen.
- */
+    /**
+     * Truncates tl_search_pdf and clears the per-crawl "seen" cache, at most
+     * once per instance; later calls return immediately.
+     *
+     * A failing TRUNCATE is logged and not rethrown.
+     */
     public function resetTableOnce(): void
     {
         if ($this->didReset) {
+            error_log('[Meili PDF DEBUG] resetTableOnce(): already reset');
             return;
         }
+        // Logged via error_log(): STDERR is undefined outside the CLI SAPI and would
+        // throw here before the flag is set and before the table is truncated.
+        error_log('[Meili PDF DEBUG] resetTableOnce(): TRUNCATE tl_search_pdf');
+
         $this->didReset = true;
         $this->seenThisCrawl = [];
         try {
             Database::getInstance()->execute('TRUNCATE tl_search_pdf');
         } catch (\Throwable $e) {
-            error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage());
+            error_log('[Meili PDF DEBUG] TRUNCATE failed: ' . $e->getMessage());
         }
     }
- /**
- * @param array $pdfLinks
- */
+    /**
+     * Processes the PDF links found on a page: each URL is normalized, resolved
+     * to an absolute path, parsed, and written via upsertPdf().
+     *
+     * A URL is handled at most once between resets (tracked in $seenThisCrawl).
+     * Links that cannot be normalized, do not point to an existing file, or
+     * yield no text are skipped. Each link is processed in its own try/catch so
+     * that one failing document does not abort the remaining links.
+     *
+     * Title precedence: link text, then the PDF metadata title, then the file name.
+     *
+     * @param array $pdfLinks Rows with a 'url' key and an optional 'linkText' key.
+     */
     public function handlePdfLinks(array $pdfLinks): void
     {
+        // All debug output goes through error_log(); the STDERR constant is only
+        // defined in the CLI SAPI and would throw in web requests.
+        error_log('[Meili PDF DEBUG] handlePdfLinks(): count=' . count($pdfLinks));
+
         foreach ($pdfLinks as $row) {
             $url = (string) ($row['url'] ?? '');
             $linkText = $row['linkText'] ?? null;
+            error_log('[Meili PDF DEBUG] URL=' . $url);
+
             if ($url === '') {
+                error_log('[Meili PDF DEBUG] → empty URL, skip');
                 continue;
             }
-            try {
-                // innerhalb des Crawls gleiche URL nicht mehrfach parsen
-                $seenKey = md5($url);
-                if (isset($this->seenThisCrawl[$seenKey])) {
-                    continue;
-                }
-                $this->seenThisCrawl[$seenKey] = true;
-
-                $normalizedPath = $this->normalizePdfUrl($url);
-                if ($normalizedPath === null) {
-                    continue;
-                }
-
-                $absolutePath = $this->getAbsolutePath($normalizedPath);
-                if (!is_file($absolutePath)) {
-                    continue;
-                }
-
-                $mtime = (int) (filemtime($absolutePath) ?: 0);
-                $checksum = md5($normalizedPath . '|' . $mtime);
-
-                // Titel-Priorität:
-                // 1) Linktext
-                // 2) PDF-Metadaten Title
-                // 3) Dateiname
-                $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
-                $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
-
-                $text = $this->parsePdf($absolutePath);
-                if ($text === '') {
-                    continue;
-                }
-
-                $this->upsertPdf(
-                    $normalizedPath,
-                    $title,
-                    $text,
-                    $checksum,
-                    $mtime
-                );
-
-            } catch (\Throwable $e) {
-                error_log(
-                    '[ContaoMeilisearch] PDF indexing failed for "' . $url . '": ' . $e->getMessage()
-                );
+            try {
+                // Do not process the same URL twice between resets.
+                $seenKey = md5($url);
+                if (isset($this->seenThisCrawl[$seenKey])) {
+                    error_log('[Meili PDF DEBUG] → already processed, skip');
+                    continue;
+                }
+                $this->seenThisCrawl[$seenKey] = true;
+
+                $normalizedPath = $this->normalizePdfUrl($url);
+                error_log('[Meili PDF DEBUG] normalizePdfUrl() → ' . ($normalizedPath ?? 'NULL'));
+
+                if ($normalizedPath === null) {
+                    error_log('[Meili PDF DEBUG] → normalization failed, skip');
+                    continue;
+                }
+
+                $absolutePath = $this->getAbsolutePath($normalizedPath);
+                error_log('[Meili PDF DEBUG] absolutePath=' . $absolutePath);
+
+                if (!is_file($absolutePath)) {
+                    error_log('[Meili PDF DEBUG] → file does NOT exist');
+                    continue;
+                }
+
+                error_log('[Meili PDF DEBUG] → file exists');
+
+                // The checksum is derived from path and modification time.
+                $mtime = (int) (filemtime($absolutePath) ?: 0);
+                $checksum = md5($normalizedPath . '|' . $mtime);
+                error_log('[Meili PDF DEBUG] mtime=' . $mtime . ' checksum=' . $checksum);
+
+                $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
+                error_log('[Meili PDF DEBUG] metaTitle=' . ($pdfMetaTitle ?: 'NULL'));
+
+                // Title precedence: link text, PDF metadata title, file name.
+                $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
+                error_log("[Meili PDF DEBUG] final title={$title}");
+
+                $text = $this->parsePdf($absolutePath);
+                error_log('[Meili PDF DEBUG] parsed text length=' . strlen($text));
+
+                if ($text === '') {
+                    error_log('[Meili PDF DEBUG] → empty text, skip');
+                    continue;
+                }
+
+                error_log('[Meili PDF DEBUG] → writing to DB');
+
+                $this->upsertPdf(
+                    $normalizedPath,
+                    $title,
+                    $text,
+                    $checksum,
+                    $mtime
+                );
+            } catch (\Throwable $e) {
+                // Keep going with the next link; a single broken document must not
+                // stop the rest of the batch.
+                error_log(
+                    '[ContaoMeilisearch] PDF indexing failed for "' . $url . '": ' . $e->getMessage()
+                );
             }
         }
     }
private function normalizePdfUrl(string $url): ?string
{
- // Fall 1: direkter /files/-Pfad
+ fwrite(STDERR, "[Meili PDF DEBUG] normalizePdfUrl(): {$url}\n");
+
if (str_starts_with($url, '/files/') && preg_match('~\.pdf(\?.*)?$~i', $url)) {
- return preg_replace('~\?.*$~', '', $url);
+ $r = preg_replace('~\?.*$~', '', $url);
+ fwrite(STDERR, "[Meili PDF DEBUG] → direct /files path {$r}\n");
+ return $r;
}
$decoded = html_entity_decode($url);
$parts = parse_url($decoded);
- // Fall 2: absolute URL auf gleiche Site
if (
!empty($parts['path'])
&& str_starts_with($parts['path'], '/files/')
&& str_ends_with(strtolower($parts['path']), '.pdf')
) {
+ fwrite(STDERR, "[Meili PDF DEBUG] → absolute URL path {$parts['path']}\n");
return $parts['path'];
}
- // Fall 3: Contao-Download-Link mit ?p=
if (empty($parts['query'])) {
+ fwrite(STDERR, "[Meili PDF DEBUG] → no query\n");
return null;
}
@@ -130,9 +159,12 @@ class PdfIndexService
if (!empty($query['p'])) {
$p = urldecode((string) $query['p']);
- return '/files/' . ltrim($p, '/');
+ $r = '/files/' . ltrim($p, '/');
+ fwrite(STDERR, "[Meili PDF DEBUG] → p= normalized {$r}\n");
+ return $r;
}
+ fwrite(STDERR, "[Meili PDF DEBUG] → no usable parameter\n");
return null;
}
@@ -141,8 +173,13 @@ class PdfIndexService
return $this->projectDir . '/' . ltrim($relativePath, '/');
}
- private function upsertPdf(string $url, string $title, string $text, string $checksum, int $mtime): void
- {
+ private function upsertPdf(
+ string $url,
+ string $title,
+ string $text,
+ string $checksum,
+ int $mtime
+ ): void {
try {
Database::getInstance()
->prepare('
@@ -165,9 +202,12 @@ class PdfIndexService
$checksum,
$mtime
);
+
+ fwrite(STDERR, "[Meili PDF DEBUG] → DB write OK\n");
} catch (\Throwable $e) {
- error_log(
- '[ContaoMeilisearch] Failed to write PDF index entry (' . $url . '): ' . $e->getMessage()
+ fwrite(
+ STDERR,
+ "[Meili PDF DEBUG] DB write failed: {$e->getMessage()}\n"
);
}
}
@@ -177,18 +217,39 @@ class PdfIndexService
try {
$parser = new Parser();
$pdf = $parser->parseFile($absolutePath);
-
$text = $this->cleanPdfContent($pdf->getText());
-
return mb_substr($text, 0, 20000);
} catch (\Throwable $e) {
- error_log(
- '[ContaoMeilisearch] Failed to parse PDF "' . $absolutePath . '": ' . $e->getMessage()
+ fwrite(
+ STDERR,
+ "[Meili PDF DEBUG] parsePdf failed: {$e->getMessage()}\n"
);
return '';
}
}
+    /**
+     * Reads the document title from the PDF's metadata ('Title' or 'title').
+     *
+     * @param string $absolutePath Path of the PDF file to inspect.
+     *
+     * @return string|null Trimmed, non-empty title; null when no usable title
+     *                     is present or the file cannot be parsed.
+     */
+    private function readPdfMetaTitle(string $absolutePath): ?string
+    {
+        try {
+            $parser = new Parser();
+            $details = $parser->parseFile($absolutePath)->getDetails();
+
+            foreach (['Title', 'title'] as $key) {
+                if (empty($details[$key]) || !is_string($details[$key])) {
+                    continue;
+                }
+
+                // A whitespace-only title counts as "no title": try the next key.
+                $title = trim($details[$key]);
+                if ($title !== '') {
+                    return $title;
+                }
+            }
+        } catch (\Throwable $e) {
+            // error_log() rather than fwrite(STDERR, ...): STDERR is only defined in the CLI SAPI.
+            error_log('[Meili PDF DEBUG] readPdfMetaTitle failed: ' . $e->getMessage());
+        }
+
+        return null;
+    }
+
private function cleanPdfContent(string $text): string
{
if (class_exists(\Normalizer::class)) {
@@ -198,34 +259,8 @@ class PdfIndexService
$text = str_replace(["\r\n", "\r"], "\n", $text);
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
- $text = str_replace(["\\'", "’", "‘"], "'", $text);
$text = preg_replace('/\s+/u', ' ', $text);
return trim($text);
}
-
- private function readPdfMetaTitle(string $absolutePath): ?string
- {
- try {
- $parser = new Parser();
- $pdf = $parser->parseFile($absolutePath);
-
- $details = $pdf->getDetails();
-
- foreach (['Title', 'title'] as $key) {
- if (!empty($details[$key]) && is_string($details[$key])) {
- $t = trim($details[$key]);
- if ($t !== '') {
- return $t;
- }
- }
- }
- } catch (\Throwable $e) {
- error_log(
- '[ContaoMeilisearch] Failed to read PDF metadata "' . $absolutePath . '": ' . $e->getMessage()
- );
- }
-
- return null;
- }
}
\ No newline at end of file