This commit is contained in:
Jürgen Mummert
2025-12-25 21:58:51 +01:00
parent 5b1d68eb06
commit d7b9174f32
+89 -66
View File
@@ -10,16 +10,11 @@ class PdfIndexService
{ {
private string $projectDir; private string $projectDir;
/** /** @var bool */
* Merkt sich Checksums innerhalb eines Crawls private bool $crawlInitialized = false;
* → verhindert Duplicate INSERTs
*/
private array $processedChecksums = [];
/** /** @var array<string, bool> */
* Flag, damit das Reset nur 1× pro Crawl passiert private array $processedChecksums = [];
*/
private bool $resetDone = false;
public function __construct(ParameterBagInterface $params) public function __construct(ParameterBagInterface $params)
{ {
@@ -27,30 +22,22 @@ class PdfIndexService
} }
/* ===================================================== /* =====================================================
* Crawl-Start: Tabelle leeren * PUBLIC API
* ===================================================== */ * ===================================================== */
public function startCrawl(): void
{
if ($this->resetDone) {
return;
}
Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf'); /**
* Einstiegspunkt aus dem IndexPageListener
$this->processedChecksums = []; *
$this->resetDone = true; * @param array<int,array{url:string,text?:string|null}> $pdfLinks
*/
error_log('PDF Crawl Start → tl_search_pdf geleert');
}
/* =====================================================
* Einstiegspunkt aus dem Listener
* ===================================================== */
public function handlePdfLinks(array $pdfLinks): void public function handlePdfLinks(array $pdfLinks): void
{ {
// 🔴 WICHTIG: Reset garantiert VOR dem ersten INSERT
$this->initializeCrawl();
foreach ($pdfLinks as $pdf) { foreach ($pdfLinks as $pdf) {
try { try {
$url = $pdf['url']; $url = $pdf['url'];
$linkText = $pdf['text'] ?? null; $linkText = $pdf['text'] ?? null;
error_log('bearbeite PDF: ' . $url); error_log('bearbeite PDF: ' . $url);
@@ -61,30 +48,36 @@ class PdfIndexService
continue; continue;
} }
$absolutePath = $this->getAbsolutePath($relativePath); $absolutePath = $this->projectDir . '/' . ltrim($relativePath, '/');
if (!is_file($absolutePath)) { if (!is_file($absolutePath)) {
error_log('→ übersprungen: Datei existiert nicht'); error_log('→ übersprungen: Datei existiert nicht');
continue; continue;
} }
// Datei-Zeitstempel
$mtime = filemtime($absolutePath) ?: 0; $mtime = filemtime($absolutePath) ?: 0;
$checksum = md5($relativePath . $mtime);
// Stabiler Crawl-Checksum
$checksum = md5($relativePath . '|' . $mtime);
// Pro Crawl deduplizieren
if (isset($this->processedChecksums[$checksum])) { if (isset($this->processedChecksums[$checksum])) {
error_log('→ übersprungen: bereits im Crawl verarbeitet'); error_log('→ übersprungen: bereits im Crawl verarbeitet');
continue; continue;
} }
$this->processedChecksums[$checksum] = true; $this->processedChecksums[$checksum] = true;
$title = $this->resolveTitle($linkText, $absolutePath); // Titel bestimmen
$text = $this->parsePdf($absolutePath); $title = $this->resolveTitle($absolutePath, $linkText);
// PDF parsen
$text = $this->parsePdf($absolutePath);
if ($text === '') { if ($text === '') {
error_log('→ übersprungen: PDF ohne Textinhalt'); error_log('→ übersprungen: PDF ohne Textinhalt');
continue; continue;
} }
// Schreiben
$this->insertPdf( $this->insertPdf(
$relativePath, $relativePath,
$title, $title,
@@ -97,47 +90,43 @@ class PdfIndexService
} catch (\Throwable $e) { } catch (\Throwable $e) {
error_log('PDF Service FEHLER: ' . $e->getMessage()); error_log('PDF Service FEHLER: ' . $e->getMessage());
error_log($e->getTraceAsString());
} }
} }
} }
/* ===================================================== /* =====================================================
* Titel-Ermittlung (Prio!) * CRAWL-LIFECYCLE
* ===================================================== */ * ===================================================== */
private function resolveTitle(?string $linkText, string $absolutePath): string
private function initializeCrawl(): void
{ {
if (is_string($linkText) && trim($linkText) !== '') { if ($this->crawlInitialized) {
return trim(strip_tags($linkText)); return;
} }
try { $this->crawlInitialized = true;
$parser = new Parser(); $this->processedChecksums = [];
$pdf = $parser->parseFile($absolutePath);
$details = $pdf->getDetails();
if (!empty($details['Title'])) { Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
return trim((string) $details['Title']);
}
} catch (\Throwable) {
// ignorieren
}
return basename($absolutePath); error_log('PDF Crawl initialisiert → tl_search_pdf geleert');
} }
/* ===================================================== /* =====================================================
* URL → relativer /files-Pfad * URL-NORMALISIERUNG
* ===================================================== */ * ===================================================== */
private function normalizePdfUrl(string $url): ?string private function normalizePdfUrl(string $url): ?string
{ {
// direkter /files-Link // Direkter /files-Link
if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) { if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) {
return $url; return $url;
} }
// Contao Download-Link (?p=pdf/...) // Contao Hash-/Download-Link (?p=)
$decoded = html_entity_decode($url); $decoded = html_entity_decode($url);
$parts = parse_url($decoded); $parts = parse_url($decoded);
if (!isset($parts['query'])) { if (!isset($parts['query'])) {
return null; return null;
@@ -153,32 +142,53 @@ class PdfIndexService
} }
/* ===================================================== /* =====================================================
* relativer → absoluter Pfad * TITEL-AUFLÖSUNG
* ===================================================== */ * ===================================================== */
private function getAbsolutePath(string $relativePath): string
private function resolveTitle(string $absolutePath, ?string $linkText): string
{ {
return $this->projectDir . '/' . ltrim($relativePath, '/'); // 1. Linktext aus HTML
if (is_string($linkText) && trim($linkText) !== '') {
return trim($linkText);
}
// 2. PDF-Metadaten
try {
$parser = new Parser();
$pdf = $parser->parseFile($absolutePath);
$details = $pdf->getDetails();
if (!empty($details['Title'])) {
return trim((string) $details['Title']);
}
} catch (\Throwable) {
// ignorieren
}
// 3. Fallback: Dateiname
return basename($absolutePath);
} }
/* ===================================================== /* =====================================================
* DB INSERT * DB
* ===================================================== */ * ===================================================== */
private function insertPdf( private function insertPdf(
string $path, string $url,
string $title, string $title,
string $text, string $text,
string $checksum, string $checksum,
int $mtime int $mtime
): void { ): void {
Database::getInstance() Database::getInstance()
->prepare(' ->prepare(
INSERT INTO tl_search_pdf 'INSERT INTO tl_search_pdf
(tstamp, url, title, text, checksum, file_mtime) (tstamp, url, title, text, checksum, file_mtime)
VALUES (?, ?, ?, ?, ?, ?) VALUES (?, ?, ?, ?, ?, ?)'
') )
->execute( ->execute(
time(), time(),
$path, $url,
$title, $title,
$text, $text,
$checksum, $checksum,
@@ -187,16 +197,20 @@ class PdfIndexService
} }
/* ===================================================== /* =====================================================
* PDF-Parsing + Cleanup * PDF PARSING
* ===================================================== */ * ===================================================== */
private function parsePdf(string $absolutePath): string private function parsePdf(string $absolutePath): string
{ {
try { try {
$parser = new Parser(); $parser = new Parser();
$pdf = $parser->parseFile($absolutePath); $pdf = $parser->parseFile($absolutePath);
$text = $this->cleanPdfContent($pdf->getText()); $text = $this->cleanPdfContent($pdf->getText());
// Begrenzen (Performance + Relevanz)
return mb_substr($text, 0, 5000); return mb_substr($text, 0, 5000);
} catch (\Throwable $e) { } catch (\Throwable $e) {
error_log('PDF Parser FEHLER: ' . $e->getMessage()); error_log('PDF Parser FEHLER: ' . $e->getMessage());
return ''; return '';
@@ -209,10 +223,19 @@ class PdfIndexService
$text = \Normalizer::normalize($text, \Normalizer::FORM_C); $text = \Normalizer::normalize($text, \Normalizer::FORM_C);
} }
// Sonderglyphen raus
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text); $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
$text = str_replace(["\\'", "", ""], "'", $text); // Worttrennungen reparieren
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', '', $text);
// Apostrophe normalisieren
$text = str_replace(["\\'", '', ''], "'", $text);
// Mehrfache Satzzeichen
$text = preg_replace('/([.,;:!?])\1+/', '$1', $text); $text = preg_replace('/([.,;:!?])\1+/', '$1', $text);
// Whitespaces
$text = preg_replace('/\s+/u', ' ', $text); $text = preg_replace('/\s+/u', ' ', $text);
return trim($text); return trim($text);