Bugfix
This commit is contained in:
@@ -15,18 +15,92 @@ class IndexPageListener
|
||||
|
||||
public function onIndexPage(string $content, array &$data, array &$set): void
|
||||
{
|
||||
fwrite(STDERR, "\n[Meili DEBUG] onIndexPage() called\n");
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* PDF: Reset genau 1× pro Crawl
|
||||
* =====================
|
||||
*/
|
||||
try {
|
||||
fwrite(STDERR, "[Meili DEBUG] resetTableOnce()\n");
|
||||
$this->pdfIndexService->resetTableOnce();
|
||||
} catch (\Throwable $e) {
|
||||
fwrite(STDERR, "[Meili DEBUG] PDF reset failed: {$e->getMessage()}\n");
|
||||
error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage());
|
||||
}
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* SEITEN-METADATEN
|
||||
* =====================
|
||||
*/
|
||||
if (str_contains($content, 'MEILISEARCH_JSON')) {
|
||||
try {
|
||||
$parsed = $this->extractMeilisearchJson($content);
|
||||
} catch (\Throwable $e) {
|
||||
error_log('[ContaoMeilisearch] Failed to extract MEILISEARCH_JSON: ' . $e->getMessage());
|
||||
$parsed = null;
|
||||
}
|
||||
|
||||
if (is_array($parsed)) {
|
||||
|
||||
// PRIORITY
|
||||
$priority =
|
||||
$parsed['event']['priority']
|
||||
?? $parsed['news']['priority']
|
||||
?? $parsed['page']['priority']
|
||||
?? null;
|
||||
|
||||
if ($priority !== null && $priority !== '') {
|
||||
$set['priority'] = (int) $priority;
|
||||
}
|
||||
|
||||
// KEYWORDS
|
||||
$keywordSources = [
|
||||
$parsed['event']['keywords'] ?? null,
|
||||
$parsed['news']['keywords'] ?? null,
|
||||
$parsed['page']['keywords'] ?? null,
|
||||
];
|
||||
|
||||
$keywords = [];
|
||||
foreach ($keywordSources as $src) {
|
||||
if (!is_string($src) || trim($src) === '') {
|
||||
continue;
|
||||
}
|
||||
foreach (preg_split('/\s+/', trim($src)) as $word) {
|
||||
$keywords[] = $word;
|
||||
}
|
||||
}
|
||||
|
||||
if ($keywords) {
|
||||
$set['keywords'] = implode(' ', array_unique($keywords));
|
||||
}
|
||||
|
||||
// IMAGEPATH
|
||||
if (!empty($parsed['page']['searchimage'])) {
|
||||
$set['imagepath'] = trim((string) $parsed['page']['searchimage']);
|
||||
}
|
||||
|
||||
// STARTDATE
|
||||
$startDate =
|
||||
$parsed['event']['startDate']
|
||||
?? $parsed['news']['startDate']
|
||||
?? null;
|
||||
|
||||
if (is_numeric($startDate) && (int) $startDate > 0) {
|
||||
$set['startDate'] = (int) $startDate;
|
||||
}
|
||||
|
||||
// CHECKSUM
|
||||
try {
|
||||
$checksumSeed = (string) ($data['checksum'] ?? '');
|
||||
$checksumSeed .= '|' . ($set['keywords'] ?? '');
|
||||
$checksumSeed .= '|' . ($set['priority'] ?? '');
|
||||
$checksumSeed .= '|' . ($set['imagepath'] ?? '');
|
||||
$checksumSeed .= '|' . ($set['startDate'] ?? '');
|
||||
|
||||
$set['checksum'] = md5($checksumSeed);
|
||||
} catch (\Throwable $e) {
|
||||
error_log('[ContaoMeilisearch] Failed to generate checksum: ' . $e->getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -35,91 +109,67 @@ class IndexPageListener
|
||||
* =====================
|
||||
*/
|
||||
if ((int) ($data['protected'] ?? 0) !== 0) {
|
||||
fwrite(STDERR, "[Meili DEBUG] Page is protected → skip files\n");
|
||||
return;
|
||||
}
|
||||
|
||||
$indexPdfs = (bool) Config::get('meilisearch_index_pdfs');
|
||||
$indexOffice = (bool) Config::get('meilisearch_index_office');
|
||||
|
||||
fwrite(
|
||||
STDERR,
|
||||
"[Meili DEBUG] Settings: pdfs="
|
||||
. ($indexPdfs ? '1' : '0')
|
||||
. " office="
|
||||
. ($indexOffice ? '1' : '0')
|
||||
. "\n"
|
||||
);
|
||||
|
||||
if (!$indexPdfs && !$indexOffice) {
|
||||
fwrite(STDERR, "[Meili DEBUG] No file indexing enabled → return\n");
|
||||
return;
|
||||
}
|
||||
|
||||
$links = $this->findAllLinks($content);
|
||||
fwrite(STDERR, "[Meili DEBUG] Found " . count($links) . " <a> links\n");
|
||||
|
||||
$pdfLinks = [];
|
||||
$officeLinks = [];
|
||||
|
||||
foreach ($links as $link) {
|
||||
fwrite(STDERR, "[Meili DEBUG] URL: {$link['url']}\n");
|
||||
|
||||
$type = $this->detectIndexableFileType($link['url']);
|
||||
fwrite(
|
||||
STDERR,
|
||||
"[Meili DEBUG] → detected type: "
|
||||
. ($type ?? 'none')
|
||||
. "\n"
|
||||
);
|
||||
|
||||
if ($type === 'pdf') {
|
||||
if ($indexPdfs) {
|
||||
fwrite(STDERR, "[Meili DEBUG] → add to PDF queue\n");
|
||||
if ($type === 'pdf' && $indexPdfs) {
|
||||
$pdfLinks[] = $link;
|
||||
} else {
|
||||
fwrite(STDERR, "[Meili DEBUG] → PDF indexing disabled\n");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (in_array($type, ['docx', 'xlsx', 'pptx'], true)) {
|
||||
if ($indexOffice) {
|
||||
fwrite(STDERR, "[Meili DEBUG] → add to OFFICE queue\n");
|
||||
if (
|
||||
in_array($type, ['docx', 'xlsx', 'pptx'], true)
|
||||
&& $indexOffice
|
||||
) {
|
||||
$officeLinks[] = $link;
|
||||
} else {
|
||||
fwrite(STDERR, "[Meili DEBUG] → Office indexing disabled\n");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
fwrite(STDERR, "[Meili DEBUG] → ignored\n");
|
||||
}
|
||||
|
||||
fwrite(
|
||||
STDERR,
|
||||
"[Meili DEBUG] Final queues: pdf="
|
||||
. count($pdfLinks)
|
||||
. " office="
|
||||
. count($officeLinks)
|
||||
. "\n"
|
||||
);
|
||||
|
||||
try {
|
||||
if ($pdfLinks !== []) {
|
||||
fwrite(STDERR, "[Meili DEBUG] Calling handlePdfLinks()\n");
|
||||
$this->pdfIndexService->handlePdfLinks($pdfLinks);
|
||||
}
|
||||
|
||||
if ($officeLinks !== []) {
|
||||
fwrite(STDERR, "[Meili DEBUG] Calling handleOfficeLinks()\n");
|
||||
$this->officeIndexService->handleOfficeLinks($officeLinks);
|
||||
}
|
||||
} catch (\Throwable $e) {
|
||||
fwrite(STDERR, "[Meili DEBUG] File indexing failed: {$e->getMessage()}\n");
|
||||
error_log('[ContaoMeilisearch] File indexing failed: ' . $e->getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Extracts the MEILISEARCH_JSON payload from an HTML comment.
 *
 * Looks for a comment of the form `<!-- MEILISEARCH_JSON {...} -->` and
 * decodes the JSON object it carries.
 *
 * @param string $content Markup that may contain the marker comment.
 *
 * @return array|null The decoded payload, or null when the marker is absent
 *                    or the payload does not decode to an array.
 */
private function extractMeilisearchJson(string $content): ?array
{
    $matches = [];
    $found = preg_match('/<!--\s*MEILISEARCH_JSON\s*(\{.*?\})\s*-->/s', $content, $matches);

    // preg_match() yields 1 on a hit, 0 on a miss and false on a PCRE error;
    // only a hit carries a payload.
    if ($found !== 1) {
        return null;
    }

    $payload = trim($matches[1]);

    // Drop a leading UTF-8 byte order mark before decoding.
    // NOTE(review): the capture group always starts with "{", so this
    // replacement looks unreachable - confirm whether a BOM between the
    // marker and the JSON was meant to be tolerated by the pattern instead.
    $payload = preg_replace('/^\xEF\xBB\xBF/', '', $payload);

    $decoded = json_decode($payload, true);

    if (json_last_error() !== JSON_ERROR_NONE) {
        return null;
    }

    if (!is_array($decoded)) {
        return null;
    }

    return $decoded;
}
|
||||
|
||||
/**
|
||||
* Sammle alle <a href="…"> Links
|
||||
*/
|
||||
@@ -150,39 +200,35 @@ class IndexPageListener
|
||||
*/
|
||||
private function detectIndexableFileType(string $url): ?string
|
||||
{
|
||||
fwrite(STDERR, "[Meili DEBUG] detectIndexableFileType(): $url\n");
|
||||
|
||||
// Hash entfernen
|
||||
$url = strtok($url, '#');
|
||||
$parts = parse_url($url);
|
||||
|
||||
$parts = parse_url($url);
|
||||
if (!$parts) {
|
||||
fwrite(STDERR, "[Meili DEBUG] → parse_url failed\n");
|
||||
return null;
|
||||
}
|
||||
|
||||
// direkter Pfad (/files/…)
|
||||
if (!empty($parts['path'])) {
|
||||
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
|
||||
fwrite(STDERR, "[Meili DEBUG] → path ext: $ext\n");
|
||||
|
||||
if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
|
||||
return $ext;
|
||||
}
|
||||
}
|
||||
|
||||
// Query-Parameter (Contao 4 + 5)
|
||||
if (!empty($parts['query'])) {
|
||||
parse_str($parts['query'], $query);
|
||||
|
||||
foreach (['file', 'p', 'f'] as $param) {
|
||||
if (!empty($query[$param])) {
|
||||
$candidate = rawurldecode(
|
||||
html_entity_decode((string) $query[$param], ENT_QUOTES)
|
||||
);
|
||||
$candidate = (string) $query[$param];
|
||||
|
||||
// sicher decodieren (Contao 4 + 5)
|
||||
$candidate = html_entity_decode($candidate, ENT_QUOTES);
|
||||
$candidate = rawurldecode($candidate);
|
||||
|
||||
$ext = strtolower(pathinfo($candidate, PATHINFO_EXTENSION));
|
||||
fwrite(
|
||||
STDERR,
|
||||
"[Meili DEBUG] → query $param=$candidate ext=$ext\n"
|
||||
);
|
||||
|
||||
if (in_array($ext, ['pdf', 'docx', 'xlsx', 'pptx'], true)) {
|
||||
return $ext;
|
||||
|
||||
@@ -10,82 +10,114 @@ class PdfIndexService
|
||||
{
|
||||
// Project root taken from the "kernel.project_dir" parameter in the
// constructor, stored without a trailing slash.
private string $projectDir;
|
||||
|
||||
// Set by resetTableOnce() so the table reset runs only once
// (original note: exactly once per PHP process).
|
||||
private bool $didReset = false;
|
||||
|
||||
// Per crawl run: URLs already handled (keyed by md5 of the URL), used to
// avoid processing the same link twice.
|
||||
private array $seenThisCrawl = [];
|
||||
|
||||
/**
 * Stores the project directory that relative file paths are resolved against.
 *
 * @param ParameterBagInterface $params Parameter bag providing "kernel.project_dir".
 */
public function __construct(ParameterBagInterface $params)
{
    // Keep the directory without a trailing slash so path segments can be
    // appended with a single "/".
    $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');

    // STDERR is only defined under the CLI SAPI. Referencing it while the
    // service is built during a web request throws an Error on PHP 8, so the
    // debug line is emitted only when the stream actually exists.
    if (\defined('STDERR')) {
        fwrite(STDERR, "[Meili PDF DEBUG] projectDir={$this->projectDir}\n");
    }
}
|
||||
|
||||
/**
 * Truncates tl_search_pdf at most once for this service instance.
 *
 * The listener calls this on every indexed page. The first call marks the
 * reset as done, clears the list of URLs already seen in this crawl and
 * issues the TRUNCATE; every later call returns immediately. The flag is set
 * before the statement runs, so a failing TRUNCATE is logged and not retried.
 */
public function resetTableOnce(): void
{
    // STDERR is only defined under the CLI SAPI. An unguarded reference during
    // a web request throws an Error before the flag is set and before the
    // TRUNCATE is issued, so debug output is written only when the stream exists.
    $debug = static function (string $message): void {
        if (\defined('STDERR')) {
            fwrite(STDERR, $message);
        }
    };

    if ($this->didReset) {
        $debug("[Meili PDF DEBUG] resetTableOnce(): already reset\n");

        return;
    }

    $debug("[Meili PDF DEBUG] resetTableOnce(): TRUNCATE tl_search_pdf\n");

    $this->didReset = true;
    $this->seenThisCrawl = [];

    try {
        Database::getInstance()->execute('TRUNCATE tl_search_pdf');
    } catch (\Throwable $e) {
        error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage());
        $debug("[Meili PDF DEBUG] TRUNCATE failed: {$e->getMessage()}\n");
    }
}
|
||||
|
||||
/**
|
||||
* @param array<int,array{url:string,linkText:?string}> $pdfLinks
|
||||
*/
|
||||
public function handlePdfLinks(array $pdfLinks): void
|
||||
{
|
||||
fwrite(
|
||||
STDERR,
|
||||
"[Meili PDF DEBUG] handlePdfLinks(): count=" . count($pdfLinks) . "\n"
|
||||
);
|
||||
|
||||
foreach ($pdfLinks as $row) {
|
||||
$url = (string) ($row['url'] ?? '');
|
||||
$linkText = $row['linkText'] ?? null;
|
||||
|
||||
fwrite(STDERR, "\n[Meili PDF DEBUG] URL={$url}\n");
|
||||
|
||||
if ($url === '') {
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → empty URL, skip\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
// innerhalb des Crawls gleiche URL nicht mehrfach parsen
|
||||
$seenKey = md5($url);
|
||||
if (isset($this->seenThisCrawl[$seenKey])) {
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → already processed, skip\n");
|
||||
continue;
|
||||
}
|
||||
$this->seenThisCrawl[$seenKey] = true;
|
||||
|
||||
$normalizedPath = $this->normalizePdfUrl($url);
|
||||
fwrite(
|
||||
STDERR,
|
||||
"[Meili PDF DEBUG] normalizePdfUrl() → "
|
||||
. ($normalizedPath ?? 'NULL')
|
||||
. "\n"
|
||||
);
|
||||
|
||||
if ($normalizedPath === null) {
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → normalization failed, skip\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
$absolutePath = $this->getAbsolutePath($normalizedPath);
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] absolutePath={$absolutePath}\n");
|
||||
|
||||
if (!is_file($absolutePath)) {
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → file does NOT exist\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → file exists\n");
|
||||
|
||||
$mtime = (int) (filemtime($absolutePath) ?: 0);
|
||||
$checksum = md5($normalizedPath . '|' . $mtime);
|
||||
|
||||
// Titel-Priorität:
|
||||
// 1) Linktext
|
||||
// 2) PDF-Metadaten Title
|
||||
// 3) Dateiname
|
||||
fwrite(
|
||||
STDERR,
|
||||
"[Meili PDF DEBUG] mtime={$mtime} checksum={$checksum}\n"
|
||||
);
|
||||
|
||||
$pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
|
||||
fwrite(
|
||||
STDERR,
|
||||
"[Meili PDF DEBUG] metaTitle="
|
||||
. ($pdfMetaTitle ?: 'NULL')
|
||||
. "\n"
|
||||
);
|
||||
|
||||
$title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] final title={$title}\n");
|
||||
|
||||
$text = $this->parsePdf($absolutePath);
|
||||
fwrite(
|
||||
STDERR,
|
||||
"[Meili PDF DEBUG] parsed text length=" . strlen($text) . "\n"
|
||||
);
|
||||
|
||||
if ($text === '') {
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → empty text, skip\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → writing to DB\n");
|
||||
|
||||
$this->upsertPdf(
|
||||
$normalizedPath,
|
||||
$title,
|
||||
@@ -93,36 +125,33 @@ class PdfIndexService
|
||||
$checksum,
|
||||
$mtime
|
||||
);
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
error_log(
|
||||
'[ContaoMeilisearch] PDF indexing failed for "' . $url . '": ' . $e->getMessage()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private function normalizePdfUrl(string $url): ?string
|
||||
{
|
||||
// Fall 1: direkter /files/-Pfad
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] normalizePdfUrl(): {$url}\n");
|
||||
|
||||
if (str_starts_with($url, '/files/') && preg_match('~\.pdf(\?.*)?$~i', $url)) {
|
||||
return preg_replace('~\?.*$~', '', $url);
|
||||
$r = preg_replace('~\?.*$~', '', $url);
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → direct /files path {$r}\n");
|
||||
return $r;
|
||||
}
|
||||
|
||||
$decoded = html_entity_decode($url);
|
||||
$parts = parse_url($decoded);
|
||||
|
||||
// Fall 2: absolute URL auf gleiche Site
|
||||
if (
|
||||
!empty($parts['path'])
|
||||
&& str_starts_with($parts['path'], '/files/')
|
||||
&& str_ends_with(strtolower($parts['path']), '.pdf')
|
||||
) {
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → absolute URL path {$parts['path']}\n");
|
||||
return $parts['path'];
|
||||
}
|
||||
|
||||
// Fall 3: Contao-Download-Link mit ?p=
|
||||
if (empty($parts['query'])) {
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → no query\n");
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -130,9 +159,12 @@ class PdfIndexService
|
||||
|
||||
if (!empty($query['p'])) {
|
||||
$p = urldecode((string) $query['p']);
|
||||
return '/files/' . ltrim($p, '/');
|
||||
$r = '/files/' . ltrim($p, '/');
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → p= normalized {$r}\n");
|
||||
return $r;
|
||||
}
|
||||
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → no usable parameter\n");
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -141,8 +173,13 @@ class PdfIndexService
|
||||
return $this->projectDir . '/' . ltrim($relativePath, '/');
|
||||
}
|
||||
|
||||
private function upsertPdf(string $url, string $title, string $text, string $checksum, int $mtime): void
|
||||
{
|
||||
private function upsertPdf(
|
||||
string $url,
|
||||
string $title,
|
||||
string $text,
|
||||
string $checksum,
|
||||
int $mtime
|
||||
): void {
|
||||
try {
|
||||
Database::getInstance()
|
||||
->prepare('
|
||||
@@ -165,9 +202,12 @@ class PdfIndexService
|
||||
$checksum,
|
||||
$mtime
|
||||
);
|
||||
|
||||
fwrite(STDERR, "[Meili PDF DEBUG] → DB write OK\n");
|
||||
} catch (\Throwable $e) {
|
||||
error_log(
|
||||
'[ContaoMeilisearch] Failed to write PDF index entry (' . $url . '): ' . $e->getMessage()
|
||||
fwrite(
|
||||
STDERR,
|
||||
"[Meili PDF DEBUG] DB write failed: {$e->getMessage()}\n"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -177,18 +217,39 @@ class PdfIndexService
|
||||
try {
|
||||
$parser = new Parser();
|
||||
$pdf = $parser->parseFile($absolutePath);
|
||||
|
||||
$text = $this->cleanPdfContent($pdf->getText());
|
||||
|
||||
return mb_substr($text, 0, 20000);
|
||||
} catch (\Throwable $e) {
|
||||
error_log(
|
||||
'[ContaoMeilisearch] Failed to parse PDF "' . $absolutePath . '": ' . $e->getMessage()
|
||||
fwrite(
|
||||
STDERR,
|
||||
"[Meili PDF DEBUG] parsePdf failed: {$e->getMessage()}\n"
|
||||
);
|
||||
return '';
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Reads the title stored in a PDF's metadata.
 *
 * Checks the "Title" and "title" detail keys in that order and returns the
 * first one that holds a non-blank string. This is best effort: any failure
 * while parsing is reported and treated as "no title".
 *
 * @param string $absolutePath Absolute filesystem path of the PDF.
 *
 * @return string|null The trimmed title, or null when none is usable.
 */
private function readPdfMetaTitle(string $absolutePath): ?string
{
    try {
        $parser = new Parser();
        $pdf = $parser->parseFile($absolutePath);
        $details = $pdf->getDetails();

        foreach (['Title', 'title'] as $key) {
            if (empty($details[$key]) || !is_string($details[$key])) {
                continue;
            }

            // A whitespace-only title passes the empty() check; skip it so
            // the method never returns an empty string.
            $title = trim($details[$key]);

            if ($title !== '') {
                return $title;
            }
        }
    } catch (\Throwable $e) {
        $message = "[Meili PDF DEBUG] readPdfMetaTitle failed: {$e->getMessage()}\n";

        // STDERR is only defined under the CLI SAPI. Referencing it inside
        // this catch during a web request would throw an Error and abort the
        // caller, so fall back to the configured error log there.
        if (\defined('STDERR')) {
            fwrite(STDERR, $message);
        } else {
            error_log(rtrim($message));
        }
    }

    return null;
}
|
||||
|
||||
private function cleanPdfContent(string $text): string
|
||||
{
|
||||
if (class_exists(\Normalizer::class)) {
|
||||
@@ -198,34 +259,8 @@ class PdfIndexService
|
||||
$text = str_replace(["\r\n", "\r"], "\n", $text);
|
||||
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
|
||||
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
|
||||
$text = str_replace(["\\'", "’", "‘"], "'", $text);
|
||||
$text = preg_replace('/\s+/u', ' ', $text);
|
||||
|
||||
return trim($text);
|
||||
}
|
||||
|
||||
/**
 * Reads the title stored in a PDF's metadata.
 *
 * Checks the "Title" and "title" detail keys in that order and returns the
 * first non-blank string. Parsing failures are written to the error log and
 * treated as "no title".
 *
 * @param string $absolutePath Absolute filesystem path of the PDF.
 *
 * @return string|null The trimmed title, or null when none is usable.
 */
private function readPdfMetaTitle(string $absolutePath): ?string
{
    try {
        $details = (new Parser())->parseFile($absolutePath)->getDetails();

        foreach (['Title', 'title'] as $field) {
            if (empty($details[$field]) || !is_string($details[$field])) {
                continue;
            }

            $candidate = trim($details[$field]);

            if ($candidate === '') {
                continue;
            }

            return $candidate;
        }
    } catch (\Throwable $e) {
        error_log(
            '[ContaoMeilisearch] Failed to read PDF metadata "' . $absolutePath . '": ' . $e->getMessage()
        );
    }

    return null;
}
|
||||
}
|
||||
Reference in New Issue
Block a user