This commit is contained in:
Jürgen Mummert
2025-12-28 11:29:13 +01:00
parent e4c8473999
commit 9edb1e4713
2 changed files with 241 additions and 160 deletions
+129 -94
View File
@@ -10,119 +10,148 @@ class PdfIndexService
{
// Project root directory without a trailing slash (taken from "kernel.project_dir").
private string $projectDir;
// Reset exactly once per PHP process: set as soon as resetTableOnce() has run.
private bool $didReset = false;
// Per crawl run: URLs already handled (keyed by md5 of the URL), to avoid duplicate processing.
private array $seenThisCrawl = [];
/**
 * Stores the project root that relative "/files/..." paths are resolved against.
 *
 * @param ParameterBagInterface $params Parameter bag providing "kernel.project_dir".
 */
public function __construct(ParameterBagInterface $params)
{
    // Strip a trailing slash so path joins can always add exactly one "/".
    $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');

    // STDERR is only defined under the CLI SAPI. Referencing it unguarded in a
    // web request throws "Undefined constant STDERR" and breaks construction.
    if (\defined('STDERR')) {
        fwrite(\STDERR, "[Meili PDF DEBUG] projectDir={$this->projectDir}\n");
    }
}
/**
 * Empties tl_search_pdf and clears the per-crawl URL memory, at most once per instance.
 *
 * Called from the listener on the first hook call of a crawl. Later calls are
 * no-ops. A failing TRUNCATE is logged and not rethrown; it is not retried
 * because the "already reset" flag is set before the statement runs.
 */
public function resetTableOnce(): void
{
    // STDERR only exists under the CLI SAPI, so debug output is skipped elsewhere
    // instead of failing with "Undefined constant STDERR".
    $debug = static function (string $message): void {
        if (\defined('STDERR')) {
            fwrite(\STDERR, "[Meili PDF DEBUG] {$message}\n");
        }
    };

    if ($this->didReset) {
        $debug('resetTableOnce(): already reset');

        return;
    }

    $debug('resetTableOnce(): TRUNCATE tl_search_pdf');

    $this->didReset = true;
    $this->seenThisCrawl = [];

    try {
        Database::getInstance()->execute('TRUNCATE tl_search_pdf');
    } catch (\Throwable $e) {
        error_log('[ContaoMeilisearch] PDF reset failed: ' . $e->getMessage());
        $debug('TRUNCATE failed: ' . $e->getMessage());
    }
}
/**
 * Indexes the PDF links found on a crawled page.
 *
 * Each link is normalised to a "/files/..." path, resolved below the project
 * directory, parsed and handed to upsertPdf(). A link is skipped when its URL is
 * empty, was already handled during this crawl, cannot be normalised, contains a
 * ".." path segment, does not exist on disk, or yields no text. An exception
 * raised for one link is logged and does not stop the remaining links.
 *
 * @param array<int,array{url:string,linkText:?string}> $pdfLinks
 */
public function handlePdfLinks(array $pdfLinks): void
{
    // STDERR only exists under the CLI SAPI, so debug output is skipped elsewhere
    // instead of failing with "Undefined constant STDERR".
    $debug = static function (string $message, string $lead = ''): void {
        if (\defined('STDERR')) {
            fwrite(\STDERR, "{$lead}[Meili PDF DEBUG] {$message}\n");
        }
    };

    $debug('handlePdfLinks(): count=' . count($pdfLinks));

    foreach ($pdfLinks as $row) {
        $url = (string) ($row['url'] ?? '');
        // Cast and trim so a non-string or whitespace-only link text cannot become the title.
        $linkText = trim((string) ($row['linkText'] ?? ''));

        $debug("URL={$url}", "\n");

        if ($url === '') {
            $debug('→ empty URL, skip');
            continue;
        }

        try {
            // Do not parse the same URL more than once within a crawl.
            $seenKey = md5($url);
            if (isset($this->seenThisCrawl[$seenKey])) {
                $debug('→ already processed, skip');
                continue;
            }
            $this->seenThisCrawl[$seenKey] = true;

            $normalizedPath = $this->normalizePdfUrl($url);
            $debug('normalizePdfUrl() → ' . ($normalizedPath ?? 'NULL'));
            if ($normalizedPath === null) {
                $debug('→ normalization failed, skip');
                continue;
            }

            // The path is derived from crawled markup; never follow ".." segments
            // out of the files directory.
            if (preg_match('~(?:^|/)\.\.(?:/|$)~', $normalizedPath) === 1) {
                $debug('→ path contains "..", skip');
                continue;
            }

            $absolutePath = $this->getAbsolutePath($normalizedPath);
            $debug("absolutePath={$absolutePath}");
            if (!is_file($absolutePath)) {
                $debug('→ file does NOT exist');
                continue;
            }
            $debug('→ file exists');

            // The checksum changes whenever the path or the modification time changes.
            $mtime = (int) (filemtime($absolutePath) ?: 0);
            $checksum = md5($normalizedPath . '|' . $mtime);
            $debug("mtime={$mtime} checksum={$checksum}");

            // Title priority: 1) link text, 2) PDF metadata title, 3) file name.
            $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
            $debug('metaTitle=' . ($pdfMetaTitle ?: 'NULL'));

            $title = $linkText;
            if ($title === '') {
                $title = ($pdfMetaTitle !== null && $pdfMetaTitle !== '')
                    ? $pdfMetaTitle
                    : basename($absolutePath);
            }
            $debug("final title={$title}");

            $text = $this->parsePdf($absolutePath);
            $debug('parsed text length=' . mb_strlen($text));
            if ($text === '') {
                $debug('→ empty text, skip');
                continue;
            }

            $debug('→ writing to DB');
            $this->upsertPdf(
                $normalizedPath,
                $title,
                $text,
                $checksum,
                $mtime
            );
        } catch (\Throwable $e) {
            // One broken link must not abort indexing of the remaining links.
            error_log(
                '[ContaoMeilisearch] PDF indexing failed for "' . $url . '": ' . $e->getMessage()
            );
            $debug('→ indexing failed: ' . $e->getMessage());
        }
    }
}
private function normalizePdfUrl(string $url): ?string
{
// Fall 1: direkter /files/-Pfad
fwrite(STDERR, "[Meili PDF DEBUG] normalizePdfUrl(): {$url}\n");
if (str_starts_with($url, '/files/') && preg_match('~\.pdf(\?.*)?$~i', $url)) {
return preg_replace('~\?.*$~', '', $url);
$r = preg_replace('~\?.*$~', '', $url);
fwrite(STDERR, "[Meili PDF DEBUG] → direct /files path {$r}\n");
return $r;
}
$decoded = html_entity_decode($url);
$parts = parse_url($decoded);
// Fall 2: absolute URL auf gleiche Site
if (
!empty($parts['path'])
&& str_starts_with($parts['path'], '/files/')
&& str_ends_with(strtolower($parts['path']), '.pdf')
) {
fwrite(STDERR, "[Meili PDF DEBUG] → absolute URL path {$parts['path']}\n");
return $parts['path'];
}
// Fall 3: Contao-Download-Link mit ?p=
if (empty($parts['query'])) {
fwrite(STDERR, "[Meili PDF DEBUG] → no query\n");
return null;
}
@@ -130,9 +159,12 @@ class PdfIndexService
if (!empty($query['p'])) {
$p = urldecode((string) $query['p']);
return '/files/' . ltrim($p, '/');
$r = '/files/' . ltrim($p, '/');
fwrite(STDERR, "[Meili PDF DEBUG] → p= normalized {$r}\n");
return $r;
}
fwrite(STDERR, "[Meili PDF DEBUG] → no usable parameter\n");
return null;
}
@@ -141,8 +173,13 @@ class PdfIndexService
return $this->projectDir . '/' . ltrim($relativePath, '/');
}
private function upsertPdf(string $url, string $title, string $text, string $checksum, int $mtime): void
{
private function upsertPdf(
string $url,
string $title,
string $text,
string $checksum,
int $mtime
): void {
try {
Database::getInstance()
->prepare('
@@ -165,9 +202,12 @@ class PdfIndexService
$checksum,
$mtime
);
fwrite(STDERR, "[Meili PDF DEBUG] → DB write OK\n");
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to write PDF index entry (' . $url . '): ' . $e->getMessage()
fwrite(
STDERR,
"[Meili PDF DEBUG] DB write failed: {$e->getMessage()}\n"
);
}
}
@@ -177,18 +217,39 @@ class PdfIndexService
try {
$parser = new Parser();
$pdf = $parser->parseFile($absolutePath);
$text = $this->cleanPdfContent($pdf->getText());
return mb_substr($text, 0, 20000);
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to parse PDF "' . $absolutePath . '": ' . $e->getMessage()
fwrite(
STDERR,
"[Meili PDF DEBUG] parsePdf failed: {$e->getMessage()}\n"
);
return '';
}
}
/**
 * Reads the document title from the PDF's metadata details.
 *
 * Checks the "Title" key first, then "title". Parse errors are logged and
 * reported as "no title".
 *
 * @return string|null Trimmed, non-empty title, or null when none is available.
 */
private function readPdfMetaTitle(string $absolutePath): ?string
{
    try {
        $details = (new Parser())->parseFile($absolutePath)->getDetails();

        foreach (['Title', 'title'] as $key) {
            $value = $details[$key] ?? null;
            if (!is_string($value)) {
                continue;
            }

            // A whitespace-only title counts as missing, so never return ''.
            $title = trim($value);
            if ($title !== '') {
                return $title;
            }
        }
    } catch (\Throwable $e) {
        error_log(
            '[ContaoMeilisearch] Failed to read PDF metadata "' . $absolutePath . '": ' . $e->getMessage()
        );

        // STDERR only exists under the CLI SAPI; guard it so a web request does
        // not fail with "Undefined constant STDERR" inside the error handler.
        if (\defined('STDERR')) {
            fwrite(
                \STDERR,
                "[Meili PDF DEBUG] readPdfMetaTitle failed: {$e->getMessage()}\n"
            );
        }
    }

    return null;
}
private function cleanPdfContent(string $text): string
{
if (class_exists(\Normalizer::class)) {
@@ -198,34 +259,8 @@ class PdfIndexService
$text = str_replace(["\r\n", "\r"], "\n", $text);
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
$text = str_replace(["\\'", "", ""], "'", $text);
$text = preg_replace('/\s+/u', ' ', $text);
return trim($text);
}
/**
 * Looks up the title recorded in a PDF's metadata details.
 *
 * The "Title" key is tried before "title". Any error raised while parsing is
 * written to the error log and treated as "no title".
 *
 * @return string|null Trimmed, non-empty title, or null when none is found.
 */
private function readPdfMetaTitle(string $absolutePath): ?string
{
    try {
        $metadata = (new Parser())->parseFile($absolutePath)->getDetails();

        foreach (['Title', 'title'] as $field) {
            if (empty($metadata[$field]) || !is_string($metadata[$field])) {
                continue;
            }

            $candidate = trim($metadata[$field]);
            if ($candidate === '') {
                continue;
            }

            return $candidate;
        }
    } catch (\Throwable $failure) {
        error_log(
            '[ContaoMeilisearch] Failed to read PDF metadata "' . $absolutePath . '": ' . $failure->getMessage()
        );
    }

    return null;
}
}