Bugfix
This commit is contained in:
@@ -16,98 +16,68 @@ class PdfIndexService
|
||||
/**
 * Stores the project directory read from the "kernel.project_dir" parameter,
 * with trailing slashes removed, and logs the resulting value.
 *
 * @param ParameterBagInterface $params Parameter bag queried for "kernel.project_dir".
 */
public function __construct(ParameterBagInterface $params)
{
    $rawProjectDir = (string) $params->get('kernel.project_dir');

    // Strip trailing slashes once here so the stored value never ends in "/".
    $this->projectDir = rtrim($rawProjectDir, '/');

    $this->debug("projectDir={$this->projectDir}");
}
|
||||
|
||||
/**
 * Writes one prefixed diagnostic line to the standard error stream.
 *
 * The STDERR constant is used when it is defined. Otherwise php://stderr is
 * opened for this single write and closed again afterwards, so repeated calls
 * do not accumulate open handles. If no stream can be opened the message is
 * dropped instead of passing `false` to fwrite().
 *
 * @param string $message Text appended after the "[Meili PDF DEBUG] " prefix.
 */
private function debug(string $message): void
{
    $ownsStream = !\defined('STDERR');
    $stream = $ownsStream ? fopen('php://stderr', 'wb') : STDERR;

    // fopen() returns false on failure; logging must never break the caller.
    if ($stream === false) {
        return;
    }

    fwrite($stream, "[Meili PDF DEBUG] {$message}\n");

    // Only close handles opened here; the shared STDERR constant stays open.
    if ($ownsStream) {
        fclose($stream);
    }
}
|
||||
|
||||
/**
 * Empties the tl_search_pdf table the first time it is called on this
 * instance; later calls only log that the reset already happened.
 *
 * Called from the listener on the first hook call of each crawl.
 */
public function resetTableOnce(): void
{
    if (!$this->didReset) {
        $this->debug('resetTableOnce(): TRUNCATE tl_search_pdf');

        // State is updated before touching the database, so a failing
        // TRUNCATE is logged below but not attempted again.
        $this->seenThisCrawl = [];
        $this->didReset = true;

        try {
            $database = Database::getInstance();
            $database->execute('TRUNCATE tl_search_pdf');
        } catch (\Throwable $error) {
            $this->debug('TRUNCATE failed: ' . $error->getMessage());
        }

        return;
    }

    $this->debug('resetTableOnce(): already reset');
}
|
||||
|
||||
/**
|
||||
* @param array<int,array{url:string,linkText:?string}> $pdfLinks
|
||||
*/
|
||||
public function handlePdfLinks(array $pdfLinks): void
|
||||
{
|
||||
$this->debug('handlePdfLinks(): count=' . count($pdfLinks));
|
||||
|
||||
foreach ($pdfLinks as $row) {
|
||||
$url = (string) ($row['url'] ?? '');
|
||||
$linkText = $row['linkText'] ?? null;
|
||||
|
||||
$this->debug("URL={$url}");
|
||||
|
||||
if ($url === '') {
|
||||
$this->debug('→ empty URL, skip');
|
||||
continue;
|
||||
}
|
||||
|
||||
// innerhalb eines Crawls doppelte URLs vermeiden
|
||||
$seenKey = md5($url);
|
||||
if (isset($this->seenThisCrawl[$seenKey])) {
|
||||
$this->debug('→ already processed, skip');
|
||||
continue;
|
||||
}
|
||||
$this->seenThisCrawl[$seenKey] = true;
|
||||
|
||||
$normalizedPath = $this->normalizePdfUrl($url);
|
||||
$this->debug('normalizePdfUrl() → ' . ($normalizedPath ?? 'NULL'));
|
||||
|
||||
if ($normalizedPath === null) {
|
||||
$this->debug('→ normalization failed, skip');
|
||||
continue;
|
||||
}
|
||||
|
||||
$absolutePath = $this->getAbsolutePath($normalizedPath);
|
||||
$this->debug("absolutePath={$absolutePath}");
|
||||
|
||||
if (!is_file($absolutePath)) {
|
||||
$this->debug('→ file does NOT exist');
|
||||
continue;
|
||||
}
|
||||
|
||||
$this->debug('→ file exists');
|
||||
|
||||
$mtime = (int) (filemtime($absolutePath) ?: 0);
|
||||
$checksum = md5($normalizedPath . '|' . $mtime);
|
||||
|
||||
$this->debug("mtime={$mtime} checksum={$checksum}");
|
||||
|
||||
// Titel-Priorität:
|
||||
// 1) Linktext
|
||||
// 2) PDF-Metadaten
|
||||
// 3) Dateiname
|
||||
$pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
|
||||
$this->debug('metaTitle=' . ($pdfMetaTitle ?: 'NULL'));
|
||||
|
||||
$title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
|
||||
$this->debug("final title={$title}");
|
||||
|
||||
$text = $this->parsePdf($absolutePath);
|
||||
$this->debug('parsed text length=' . strlen($text));
|
||||
|
||||
if ($text === '') {
|
||||
$this->debug('→ empty text, skip');
|
||||
continue;
|
||||
}
|
||||
|
||||
$this->debug('→ writing to DB');
|
||||
|
||||
$this->upsertPdf(
|
||||
$normalizedPath,
|
||||
$title,
|
||||
@@ -120,48 +90,52 @@ class PdfIndexService
|
||||
|
||||
/**
 * Maps a PDF link URL to a site-relative path that starts with "/files/".
 *
 * Recognised forms, checked in this order:
 *  1) URL path "files/....pdf" (no leading slash): returned with "/" prepended
 *  2) URL path "/files/....pdf": returned unchanged
 *  3) query parameter "file=files/....pdf" (labelled "Contao 4" in the original comments)
 *  4) query parameter "p=..." (labelled "Contao 5"): returned as "/files/" + value
 *
 * The ".pdf" suffix check is case-insensitive.
 *
 * @param string $url Link URL; HTML entities are decoded before it is parsed.
 *
 * @return string|null The normalised path, or null when none of the forms match.
 */
private function normalizePdfUrl(string $url): ?string
{
    $parts = parse_url(html_entity_decode($url));

    // parse_url() returns false for seriously malformed URLs.
    if (!is_array($parts)) {
        return null;
    }

    $isPdf = static fn (string $candidate): bool => str_ends_with(strtolower($candidate), '.pdf');
    $path = (string) ($parts['path'] ?? '');

    // 1) files/...pdf (without leading slash)
    if (str_starts_with($path, 'files/') && $isPdf($path)) {
        return '/' . $path;
    }

    // 2) /files/...pdf
    if (str_starts_with($path, '/files/') && $isPdf($path)) {
        return $path;
    }

    if (empty($parts['query'])) {
        return null;
    }

    parse_str($parts['query'], $query);

    // 3) Contao 4: ?file=files/...
    // is_string() skips array-valued parameters such as "file[]=...".
    if (!empty($query['file']) && is_string($query['file'])) {
        // NOTE(review): parse_str() already URL-decodes values, so this is a
        // second decode; kept as in the original - confirm it is intended.
        $file = ltrim(urldecode($query['file']), '/');

        if (str_starts_with($file, 'files/') && $isPdf($file)) {
            return '/' . $file;
        }
    }

    // 4) Contao 5: ?p=...
    // NOTE(review): unlike the other branches, this one checks neither the
    // ".pdf" suffix nor ".." segments - confirm callers validate the path.
    if (!empty($query['p']) && is_string($query['p'])) {
        $p = urldecode($query['p']);

        return '/files/' . ltrim($p, '/');
    }

    return null;
}
|
||||
|
||||
@@ -177,7 +151,6 @@ class PdfIndexService
|
||||
string $checksum,
|
||||
int $mtime
|
||||
): void {
|
||||
try {
|
||||
Database::getInstance()
|
||||
->prepare('
|
||||
INSERT INTO tl_search_pdf
|
||||
@@ -199,11 +172,6 @@ class PdfIndexService
|
||||
$checksum,
|
||||
$mtime
|
||||
);
|
||||
|
||||
$this->debug('→ DB write OK');
|
||||
} catch (\Throwable $e) {
|
||||
$this->debug('DB write failed: ' . $e->getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private function parsePdf(string $absolutePath): string
|
||||
@@ -212,9 +180,9 @@ class PdfIndexService
|
||||
$parser = new Parser();
|
||||
$pdf = $parser->parseFile($absolutePath);
|
||||
$text = $this->cleanPdfContent($pdf->getText());
|
||||
|
||||
return mb_substr($text, 0, 20000);
|
||||
} catch (\Throwable $e) {
|
||||
$this->debug('parsePdf failed: ' . $e->getMessage());
|
||||
} catch (\Throwable) {
|
||||
return '';
|
||||
}
|
||||
}
|
||||
@@ -228,11 +196,13 @@ class PdfIndexService
|
||||
|
||||
foreach (['Title', 'title'] as $key) {
|
||||
if (!empty($details[$key]) && is_string($details[$key])) {
|
||||
return trim($details[$key]);
|
||||
$t = trim($details[$key]);
|
||||
if ($t !== '') {
|
||||
return $t;
|
||||
}
|
||||
}
|
||||
} catch (\Throwable $e) {
|
||||
$this->debug('readPdfMetaTitle failed: ' . $e->getMessage());
|
||||
}
|
||||
} catch (\Throwable) {
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
Reference in New Issue
Block a user