This commit is contained in:
Jürgen Mummert
2025-12-28 11:56:23 +01:00
parent 2a22253f18
commit 04f3e76c8f
+39 -69
View File
@@ -16,98 +16,68 @@ class PdfIndexService
public function __construct(ParameterBagInterface $params) public function __construct(ParameterBagInterface $params)
{ {
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/'); $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
$this->debug("projectDir={$this->projectDir}");
}
private function debug(string $message): void
{
$stream = \defined('STDERR')
? STDERR
: fopen('php://stderr', 'wb');
fwrite($stream, "[Meili PDF DEBUG] {$message}\n");
} }
/**
* Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen.
*/
public function resetTableOnce(): void public function resetTableOnce(): void
{ {
if ($this->didReset) { if ($this->didReset) {
$this->debug('resetTableOnce(): already reset');
return; return;
} }
$this->debug('resetTableOnce(): TRUNCATE tl_search_pdf');
$this->didReset = true; $this->didReset = true;
$this->seenThisCrawl = []; $this->seenThisCrawl = [];
try {
Database::getInstance()->execute('TRUNCATE tl_search_pdf'); Database::getInstance()->execute('TRUNCATE tl_search_pdf');
} catch (\Throwable $e) {
$this->debug('TRUNCATE failed: ' . $e->getMessage());
}
} }
/**
* @param array<int,array{url:string,linkText:?string}> $pdfLinks
*/
public function handlePdfLinks(array $pdfLinks): void public function handlePdfLinks(array $pdfLinks): void
{ {
$this->debug('handlePdfLinks(): count=' . count($pdfLinks));
foreach ($pdfLinks as $row) { foreach ($pdfLinks as $row) {
$url = (string) ($row['url'] ?? ''); $url = (string) ($row['url'] ?? '');
$linkText = $row['linkText'] ?? null; $linkText = $row['linkText'] ?? null;
$this->debug("URL={$url}");
if ($url === '') { if ($url === '') {
$this->debug('→ empty URL, skip');
continue; continue;
} }
// innerhalb eines Crawls doppelte URLs vermeiden
$seenKey = md5($url); $seenKey = md5($url);
if (isset($this->seenThisCrawl[$seenKey])) { if (isset($this->seenThisCrawl[$seenKey])) {
$this->debug('→ already processed, skip');
continue; continue;
} }
$this->seenThisCrawl[$seenKey] = true; $this->seenThisCrawl[$seenKey] = true;
$normalizedPath = $this->normalizePdfUrl($url); $normalizedPath = $this->normalizePdfUrl($url);
$this->debug('normalizePdfUrl() → ' . ($normalizedPath ?? 'NULL'));
if ($normalizedPath === null) { if ($normalizedPath === null) {
$this->debug('→ normalization failed, skip');
continue; continue;
} }
$absolutePath = $this->getAbsolutePath($normalizedPath); $absolutePath = $this->getAbsolutePath($normalizedPath);
$this->debug("absolutePath={$absolutePath}");
if (!is_file($absolutePath)) { if (!is_file($absolutePath)) {
$this->debug('→ file does NOT exist');
continue; continue;
} }
$this->debug('→ file exists');
$mtime = (int) (filemtime($absolutePath) ?: 0); $mtime = (int) (filemtime($absolutePath) ?: 0);
$checksum = md5($normalizedPath . '|' . $mtime); $checksum = md5($normalizedPath . '|' . $mtime);
$this->debug("mtime={$mtime} checksum={$checksum}"); // Titel-Priorität:
// 1) Linktext
// 2) PDF-Metadaten
// 3) Dateiname
$pdfMetaTitle = $this->readPdfMetaTitle($absolutePath); $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
$this->debug('metaTitle=' . ($pdfMetaTitle ?: 'NULL'));
$title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath)); $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
$this->debug("final title={$title}");
$text = $this->parsePdf($absolutePath); $text = $this->parsePdf($absolutePath);
$this->debug('parsed text length=' . strlen($text));
if ($text === '') { if ($text === '') {
$this->debug('→ empty text, skip');
continue; continue;
} }
$this->debug('→ writing to DB');
$this->upsertPdf( $this->upsertPdf(
$normalizedPath, $normalizedPath,
$title, $title,
@@ -120,48 +90,52 @@ class PdfIndexService
private function normalizePdfUrl(string $url): ?string private function normalizePdfUrl(string $url): ?string
{ {
$this->debug("normalizePdfUrl(): {$url}");
$decoded = html_entity_decode($url); $decoded = html_entity_decode($url);
$parts = parse_url($decoded); $parts = parse_url($decoded);
if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/') && str_ends_with(strtolower($parts['path']), '.pdf')) { // 1) files/...pdf (ohne führenden Slash)
$r = '/' . $parts['path']; if (
$this->debug("→ relative files path {$r}"); !empty($parts['path'])
return $r; && str_starts_with($parts['path'], 'files/')
&& str_ends_with(strtolower($parts['path']), '.pdf')
) {
return '/' . $parts['path'];
} }
if (!empty($parts['path']) && str_starts_with($parts['path'], '/files/') && str_ends_with(strtolower($parts['path']), '.pdf')) { // 2) /files/...pdf
$this->debug("→ absolute files path {$parts['path']}"); if (
!empty($parts['path'])
&& str_starts_with($parts['path'], '/files/')
&& str_ends_with(strtolower($parts['path']), '.pdf')
) {
return $parts['path']; return $parts['path'];
} }
if (empty($parts['query'])) { if (empty($parts['query'])) {
$this->debug('→ no query');
return null; return null;
} }
parse_str($parts['query'], $query); parse_str($parts['query'], $query);
// 3) Contao 4: ?file=files/...
if (!empty($query['file'])) { if (!empty($query['file'])) {
$file = urldecode((string) $query['file']); $file = urldecode((string) $query['file']);
$file = ltrim($file, '/'); $file = ltrim($file, '/');
if (str_starts_with($file, 'files/') && str_ends_with(strtolower($file), '.pdf')) { if (
$r = '/' . $file; str_starts_with($file, 'files/')
$this->debug("→ file= normalized {$r}"); && str_ends_with(strtolower($file), '.pdf')
return $r; ) {
return '/' . $file;
} }
} }
// 4) Contao 5: ?p=...
if (!empty($query['p'])) { if (!empty($query['p'])) {
$p = urldecode((string) $query['p']); $p = urldecode((string) $query['p']);
$r = '/files/' . ltrim($p, '/'); return '/files/' . ltrim($p, '/');
$this->debug("→ p= normalized {$r}");
return $r;
} }
$this->debug('→ no usable parameter');
return null; return null;
} }
@@ -177,7 +151,6 @@ class PdfIndexService
string $checksum, string $checksum,
int $mtime int $mtime
): void { ): void {
try {
Database::getInstance() Database::getInstance()
->prepare(' ->prepare('
INSERT INTO tl_search_pdf INSERT INTO tl_search_pdf
@@ -199,11 +172,6 @@ class PdfIndexService
$checksum, $checksum,
$mtime $mtime
); );
$this->debug('→ DB write OK');
} catch (\Throwable $e) {
$this->debug('DB write failed: ' . $e->getMessage());
}
} }
private function parsePdf(string $absolutePath): string private function parsePdf(string $absolutePath): string
@@ -212,9 +180,9 @@ class PdfIndexService
$parser = new Parser(); $parser = new Parser();
$pdf = $parser->parseFile($absolutePath); $pdf = $parser->parseFile($absolutePath);
$text = $this->cleanPdfContent($pdf->getText()); $text = $this->cleanPdfContent($pdf->getText());
return mb_substr($text, 0, 20000); return mb_substr($text, 0, 20000);
} catch (\Throwable $e) { } catch (\Throwable) {
$this->debug('parsePdf failed: ' . $e->getMessage());
return ''; return '';
} }
} }
@@ -228,11 +196,13 @@ class PdfIndexService
foreach (['Title', 'title'] as $key) { foreach (['Title', 'title'] as $key) {
if (!empty($details[$key]) && is_string($details[$key])) { if (!empty($details[$key]) && is_string($details[$key])) {
return trim($details[$key]); $t = trim($details[$key]);
if ($t !== '') {
return $t;
} }
} }
} catch (\Throwable $e) { }
$this->debug('readPdfMetaTitle failed: ' . $e->getMessage()); } catch (\Throwable) {
} }
return null; return null;