This commit is contained in:
Jürgen Mummert
2025-12-28 11:25:21 +01:00
parent 5102f98e2b
commit e4c8473999
+65 -111
View File
@@ -15,92 +15,18 @@ class IndexPageListener
public function onIndexPage(string $content, array &$data, array &$set): void public function onIndexPage(string $content, array &$data, array &$set): void
{ {
fwrite(STDERR, "\n[Meili DEBUG] onIndexPage() called\n");
/* /*
* ===================== * =====================
* PDF: Reset genau 1× pro Crawl * PDF: Reset genau 1× pro Crawl
* ===================== * =====================
*/ */
try { try {
fwrite(STDERR, "[Meili DEBUG] resetTableOnce()\n");
$this->pdfIndexService->resetTableOnce(); $this->pdfIndexService->resetTableOnce();
} catch (\Throwable $e) { } catch (\Throwable $e) {
error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage()); fwrite(STDERR, "[Meili DEBUG] PDF reset failed: {$e->getMessage()}\n");
}
/*
* =====================
* SEITEN-METADATEN
* =====================
*/
if (str_contains($content, 'MEILISEARCH_JSON')) {
try {
$parsed = $this->extractMeilisearchJson($content);
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to extract MEILISEARCH_JSON: ' . $e->getMessage());
$parsed = null;
}
if (is_array($parsed)) {
// PRIORITY
$priority =
$parsed['event']['priority']
?? $parsed['news']['priority']
?? $parsed['page']['priority']
?? null;
if ($priority !== null && $priority !== '') {
$set['priority'] = (int) $priority;
}
// KEYWORDS
$keywordSources = [
$parsed['event']['keywords'] ?? null,
$parsed['news']['keywords'] ?? null,
$parsed['page']['keywords'] ?? null,
];
$keywords = [];
foreach ($keywordSources as $src) {
if (!is_string($src) || trim($src) === '') {
continue;
}
foreach (preg_split('/\s+/', trim($src)) as $word) {
$keywords[] = $word;
}
}
if ($keywords) {
$set['keywords'] = implode(' ', array_unique($keywords));
}
// IMAGEPATH
if (!empty($parsed['page']['searchimage'])) {
$set['imagepath'] = trim((string) $parsed['page']['searchimage']);
}
// STARTDATE
$startDate =
$parsed['event']['startDate']
?? $parsed['news']['startDate']
?? null;
if (is_numeric($startDate) && (int) $startDate > 0) {
$set['startDate'] = (int) $startDate;
}
// CHECKSUM
try {
$checksumSeed = (string) ($data['checksum'] ?? '');
$checksumSeed .= '|' . ($set['keywords'] ?? '');
$checksumSeed .= '|' . ($set['priority'] ?? '');
$checksumSeed .= '|' . ($set['imagepath'] ?? '');
$checksumSeed .= '|' . ($set['startDate'] ?? '');
$set['checksum'] = md5($checksumSeed);
} catch (\Throwable $e) {
error_log('[ContaoMeilisearch] Failed to generate checksum: ' . $e->getMessage());
}
}
} }
/* /*
@@ -109,67 +35,91 @@ class IndexPageListener
* ===================== * =====================
*/ */
if ((int) ($data['protected'] ?? 0) !== 0) { if ((int) ($data['protected'] ?? 0) !== 0) {
fwrite(STDERR, "[Meili DEBUG] Page is protected → skip files\n");
return; return;
} }
$indexPdfs = (bool) Config::get('meilisearch_index_pdfs'); $indexPdfs = (bool) Config::get('meilisearch_index_pdfs');
$indexOffice = (bool) Config::get('meilisearch_index_office'); $indexOffice = (bool) Config::get('meilisearch_index_office');
fwrite(
STDERR,
"[Meili DEBUG] Settings: pdfs="
. ($indexPdfs ? '1' : '0')
. " office="
. ($indexOffice ? '1' : '0')
. "\n"
);
if (!$indexPdfs && !$indexOffice) { if (!$indexPdfs && !$indexOffice) {
fwrite(STDERR, "[Meili DEBUG] No file indexing enabled → return\n");
return; return;
} }
$links = $this->findAllLinks($content); $links = $this->findAllLinks($content);
fwrite(STDERR, "[Meili DEBUG] Found " . count($links) . " <a> links\n");
$pdfLinks = []; $pdfLinks = [];
$officeLinks = []; $officeLinks = [];
foreach ($links as $link) { foreach ($links as $link) {
$type = $this->detectIndexableFileType($link['url']); fwrite(STDERR, "[Meili DEBUG] URL: {$link['url']}\n");
if ($type === 'pdf' && $indexPdfs) { $type = $this->detectIndexableFileType($link['url']);
fwrite(
STDERR,
"[Meili DEBUG] → detected type: "
. ($type ?? 'none')
. "\n"
);
if ($type === 'pdf') {
if ($indexPdfs) {
fwrite(STDERR, "[Meili DEBUG] → add to PDF queue\n");
$pdfLinks[] = $link; $pdfLinks[] = $link;
} else {
fwrite(STDERR, "[Meili DEBUG] → PDF indexing disabled\n");
}
continue; continue;
} }
if ( if (in_array($type, ['docx', 'xlsx', 'pptx'], true)) {
in_array($type, ['docx', 'xlsx', 'pptx'], true) if ($indexOffice) {
&& $indexOffice fwrite(STDERR, "[Meili DEBUG] → add to OFFICE queue\n");
) {
$officeLinks[] = $link; $officeLinks[] = $link;
} else {
fwrite(STDERR, "[Meili DEBUG] → Office indexing disabled\n");
} }
continue;
} }
fwrite(STDERR, "[Meili DEBUG] → ignored\n");
}
fwrite(
STDERR,
"[Meili DEBUG] Final queues: pdf="
. count($pdfLinks)
. " office="
. count($officeLinks)
. "\n"
);
try { try {
if ($pdfLinks !== []) { if ($pdfLinks !== []) {
fwrite(STDERR, "[Meili DEBUG] Calling handlePdfLinks()\n");
$this->pdfIndexService->handlePdfLinks($pdfLinks); $this->pdfIndexService->handlePdfLinks($pdfLinks);
} }
if ($officeLinks !== []) { if ($officeLinks !== []) {
fwrite(STDERR, "[Meili DEBUG] Calling handleOfficeLinks()\n");
$this->officeIndexService->handleOfficeLinks($officeLinks); $this->officeIndexService->handleOfficeLinks($officeLinks);
} }
} catch (\Throwable $e) { } catch (\Throwable $e) {
error_log('[ContaoMeilisearch] File indexing failed: ' . $e->getMessage()); fwrite(STDERR, "[Meili DEBUG] File indexing failed: {$e->getMessage()}\n");
} }
} }
/**
 * Extracts the MEILISEARCH_JSON payload from an HTML comment.
 *
 * Looks for a comment of the form `<!-- MEILISEARCH_JSON {...} -->` in the
 * given markup and decodes the JSON object it carries.
 *
 * @param string $content HTML markup to search.
 *
 * @return array|null The decoded payload as an associative array, or null
 *                    when no marker comment is present or its JSON is invalid.
 */
private function extractMeilisearchJson(string $content): ?array
{
    // An optional UTF-8 BOM is accepted in front of the JSON object. It has to
    // be consumed by the pattern itself: the capture group must start with
    // "{", so a BOM can never end up inside the captured text.
    $pattern = '/<!--\s*MEILISEARCH_JSON\s*(?:\xEF\xBB\xBF\s*)?(\{.*?\})\s*-->/s';

    // preg_match() yields 0 for "no match" and false on a PCRE error; both
    // mean there is nothing to decode.
    if (preg_match($pattern, $content, $m) !== 1) {
        return null;
    }

    try {
        $data = json_decode($m[1], true, flags: JSON_THROW_ON_ERROR);
    } catch (\JsonException) {
        // Malformed JSON is reported as "no payload", not as an error.
        return null;
    }

    return is_array($data) ? $data : null;
}
/** /**
* Sammle alle <a href="…"> Links * Sammle alle <a href="…"> Links
*/ */
@@ -200,35 +150,39 @@ class IndexPageListener
*/ */
private function detectIndexableFileType(string $url): ?string private function detectIndexableFileType(string $url): ?string
{ {
// Hash entfernen fwrite(STDERR, "[Meili DEBUG] detectIndexableFileType(): $url\n");
$url = strtok($url, '#');
$url = strtok($url, '#');
$parts = parse_url($url); $parts = parse_url($url);
if (!$parts) { if (!$parts) {
fwrite(STDERR, "[Meili DEBUG] → parse_url failed\n");
return null; return null;
} }
// direkter Pfad (/files/…)
if (!empty($parts['path'])) { if (!empty($parts['path'])) {
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION)); $ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
fwrite(STDERR, "[Meili DEBUG] → path ext: $ext\n");
if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) { if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
return $ext; return $ext;
} }
} }
// Query-Parameter (Contao 4 + 5)
if (!empty($parts['query'])) { if (!empty($parts['query'])) {
parse_str($parts['query'], $query); parse_str($parts['query'], $query);
foreach (['file', 'p', 'f'] as $param) { foreach (['file', 'p', 'f'] as $param) {
if (!empty($query[$param])) { if (!empty($query[$param])) {
$candidate = (string) $query[$param]; $candidate = rawurldecode(
html_entity_decode((string) $query[$param], ENT_QUOTES)
// sicher decodieren (Contao 4 + 5) );
$candidate = html_entity_decode($candidate, ENT_QUOTES);
$candidate = rawurldecode($candidate);
$ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION)); $ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION));
fwrite(
STDERR,
"[Meili DEBUG] → query $param=$candidate ext=$ext\n"
);
if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) { if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
return $ext; return $ext;