This commit is contained in:
Jürgen Mummert
2025-12-25 15:47:17 +01:00
parent 798aaadb7e
commit 6d86faa62f
4 changed files with 45 additions and 127 deletions
+5 -5
View File
@@ -101,18 +101,18 @@ class IndexPageListener
*/ */
$pdfLinks = $this->findPdfLinks($content); $pdfLinks = $this->findPdfLinks($content);
if ($pdfLinks !== []) { if ($pdfLinks === []) {
error_log('PDF gefunden'); return;
}
// PdfIndexService lazy aus dem Container holen // Service lazy aus Container holen
if ($this->pdfIndexService === null) { if ($this->pdfIndexService === null) {
$this->pdfIndexService = System::getContainer()->get(PdfIndexService::class); $this->pdfIndexService = System::getContainer()->get(PdfIndexService::class);
$this->pdfIndexService->resetTableOnce();
} }
$this->pdfIndexService->startCrawl();
$this->pdfIndexService->handlePdfLinks($pdfLinks); $this->pdfIndexService->handlePdfLinks($pdfLinks);
} }
}
/* ===================================================== /* =====================================================
* JSON aus Marker extrahieren * JSON aus Marker extrahieren
-18
View File
@@ -1,18 +0,0 @@
<?php
namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
use MummertMedia\ContaoMeilisearchBundle\Service\PdfIndexService;
/**
 * Listener that delegates the removal of stale PDF index rows to the
 * PdfIndexService once its last-chunk handler is invoked.
 */
class PdfCleanupListener
{
    /** Log line written immediately before the cleanup is delegated. */
    private const CLEANUP_MESSAGE = 'Crawler beendet → PDF Cleanup startet';

    /** Service that owns the tl_search_pdf bookkeeping. */
    private PdfIndexService $pdfIndexService;

    /**
     * @param PdfIndexService $pdfIndexService Service the cleanup call is forwarded to.
     */
    public function __construct(PdfIndexService $pdfIndexService)
    {
        $this->pdfIndexService = $pdfIndexService;
    }

    /**
     * Logs that the cleanup starts and then asks the service to run it.
     */
    public function onLastChunk(): void
    {
        error_log(self::CLEANUP_MESSAGE);
        $this->pdfIndexService->cleanupRemovedPdfs();
    }
}
-4
View File
@@ -10,7 +10,3 @@ services:
MummertMedia\ContaoMeilisearchBundle\EventListener\IndexPageListener: MummertMedia\ContaoMeilisearchBundle\EventListener\IndexPageListener:
tags: tags:
- { name: contao.hook, hook: indexPage, method: onIndexPage } - { name: contao.hook, hook: indexPage, method: onIndexPage }
MummertMedia\ContaoMeilisearchBundle\EventListener\PdfCleanupListener:
tags:
- { name: kernel.event_listener, event: terminal42.escargot.last_chunk, method: onLastChunk }
+32 -92
View File
@@ -8,63 +8,43 @@ use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
class PdfIndexService class PdfIndexService
{ {
private int $crawlStart = 0;
/**
 * Records the crawl start time on the first call.
 *
 * The timestamp is only written while $crawlStart still equals 0, so
 * repeated calls keep the value stored by the first one.
 */
public function startCrawl(): void
{
    // A non-zero value means the start was already recorded.
    if ($this->crawlStart !== 0) {
        return;
    }

    $startedAt = time();
    $this->crawlStart = $startedAt;
    error_log('PDF Crawl Start: ' . $startedAt);
}
/**
 * Deletes every tl_search_pdf row whose tstamp is older than the recorded
 * crawl start.
 *
 * Does nothing when no crawl start has been recorded ($crawlStart is 0).
 */
public function cleanupRemovedPdfs(): void
{
    if ($this->crawlStart !== 0) {
        $statement = Database::getInstance()
            ->prepare('DELETE FROM tl_search_pdf WHERE tstamp < ?');
        $statement->execute($this->crawlStart);

        error_log('PDF Cleanup abgeschlossen');
    }
}
private string $projectDir; private string $projectDir;
private bool $tableReset = false;
public function __construct(ParameterBagInterface $params) public function __construct(ParameterBagInterface $params)
{ {
// Contao 5 / Symfony-konform
$this->projectDir = rtrim($params->get('kernel.project_dir'), '/'); $this->projectDir = rtrim($params->get('kernel.project_dir'), '/');
} }
/** /* =====================================================
* Einstiegspunkt aus dem IndexPageListener * Reset tl_search_pdf einmal pro Crawl
*/ * ===================================================== */
/**
 * Empties tl_search_pdf the first time it is called on this instance.
 *
 * The $tableReset flag is set after the TRUNCATE, so any further call on
 * the same instance is a no-op.
 */
public function resetTableOnce(): void
{
    if (!$this->tableReset) {
        $database = Database::getInstance();
        $database->execute('TRUNCATE TABLE tl_search_pdf');

        $this->tableReset = true;
        error_log('PDF Reset: tl_search_pdf geleert');
    }
}
/* =====================================================
* Einstiegspunkt aus Listener
* ===================================================== */
public function handlePdfLinks(array $pdfLinks): void public function handlePdfLinks(array $pdfLinks): void
{ {
error_log('PDF Service aufgerufen');
error_log('PDF Links Count: ' . count($pdfLinks));
error_log('PDF Links: ' . json_encode($pdfLinks, JSON_UNESCAPED_SLASHES));
foreach ($pdfLinks as $url) { foreach ($pdfLinks as $url) {
try { try {
error_log('bearbeite PDF: ' . $url);
$normalizedPath = $this->normalizePdfUrl($url); $normalizedPath = $this->normalizePdfUrl($url);
error_log('umgewandelte URL: ' . var_export($normalizedPath, true));
if ($normalizedPath === null) { if ($normalizedPath === null) {
error_log('→ übersprungen: kein gültiger PDF-Pfad');
continue; continue;
} }
$absolutePath = $this->getAbsolutePath($normalizedPath); $absolutePath = $this->getAbsolutePath($normalizedPath);
error_log('absoluter Pfad: ' . var_export($absolutePath, true));
if (!is_file($absolutePath)) { if (!is_file($absolutePath)) {
error_log('→ übersprungen: Datei existiert nicht');
continue; continue;
} }
@@ -72,32 +52,24 @@ class PdfIndexService
$checksum = md5($normalizedPath . $mtime); $checksum = md5($normalizedPath . $mtime);
if ($this->alreadyIndexed($checksum)) { if ($this->alreadyIndexed($checksum)) {
error_log('→ übersprungen: bereits indexiert');
continue; continue;
} }
$title = basename($absolutePath);
error_log('gefundener Title: ' . $title);
$text = $this->parsePdf($absolutePath); $text = $this->parsePdf($absolutePath);
if ($text === '') { if ($text === '') {
error_log('→ übersprungen: PDF ohne Textinhalt');
continue; continue;
} }
$this->insertPdf( $this->insertPdf(
$normalizedPath, $normalizedPath,
$title, basename($absolutePath),
$text, $text,
$checksum, $checksum,
$mtime $mtime
); );
error_log('geschrieben in tl_search_pdf');
} catch (\Throwable $e) { } catch (\Throwable $e) {
error_log('PDF Service FEHLER (pro PDF): ' . $e->getMessage()); error_log('PDF Service Fehler: ' . $e->getMessage());
error_log($e->getTraceAsString());
} }
} }
} }
@@ -107,15 +79,13 @@ class PdfIndexService
* ===================================================== */ * ===================================================== */
private function normalizePdfUrl(string $url): ?string private function normalizePdfUrl(string $url): ?string
{ {
// Fall 1: direkter /files/-Pfad
if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) { if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) {
return $url; return $url;
} }
// Fall 2: Contao-Download-Link mit ?p=
$decoded = html_entity_decode($url); $decoded = html_entity_decode($url);
$parts = parse_url($decoded); $parts = parse_url($decoded);
if (!isset($parts['query'])) { if (!isset($parts['query'])) {
return null; return null;
} }
@@ -123,7 +93,6 @@ class PdfIndexService
parse_str($parts['query'], $query); parse_str($parts['query'], $query);
if (!empty($query['p'])) { if (!empty($query['p'])) {
// Contao speichert Pfade relativ zu /files
return '/files/' . ltrim($query['p'], '/'); return '/files/' . ltrim($query['p'], '/');
} }
@@ -131,7 +100,7 @@ class PdfIndexService
} }
/* ===================================================== /* =====================================================
* relativer Pfad → absoluter Pfad * Pfade
* ===================================================== */ * ===================================================== */
private function getAbsolutePath(string $relativePath): string private function getAbsolutePath(string $relativePath): string
{ {
@@ -139,13 +108,11 @@ class PdfIndexService
} }
/* ===================================================== /* =====================================================
* DB-Helfer * DB
* ===================================================== */ * ===================================================== */
private function alreadyIndexed(string $checksum): bool private function alreadyIndexed(string $checksum): bool
{ {
$db = Database::getInstance(); $result = Database::getInstance()
$result = $db
->prepare('SELECT id FROM tl_search_pdf WHERE checksum = ?') ->prepare('SELECT id FROM tl_search_pdf WHERE checksum = ?')
->execute($checksum); ->execute($checksum);
@@ -159,9 +126,7 @@ class PdfIndexService
string $checksum, string $checksum,
int $mtime int $mtime
): void { ): void {
$db = Database::getInstance(); Database::getInstance()
$db
->prepare(' ->prepare('
INSERT INTO tl_search_pdf INSERT INTO tl_search_pdf
(tstamp, url, title, text, checksum, file_mtime) (tstamp, url, title, text, checksum, file_mtime)
@@ -186,43 +151,18 @@ class PdfIndexService
$parser = new Parser(); $parser = new Parser();
$pdf = $parser->parseFile($absolutePath); $pdf = $parser->parseFile($absolutePath);
$text = $this->cleanPdfContent($pdf->getText()); $text = $pdf->getText();
// bewusst begrenzen (Performance + Relevanz)
return mb_substr($text, 0, 5000);
} catch (\Throwable $e) {
error_log('PDF Parser FEHLER: ' . $e->getMessage());
return '';
}
}
private function cleanPdfContent(string $text): string
{
// 1. Unicode normalisieren (wichtig!)
if (class_exists(\Normalizer::class)) { if (class_exists(\Normalizer::class)) {
$text = \Normalizer::normalize($text, \Normalizer::FORM_C); $text = \Normalizer::normalize($text, \Normalizer::FORM_C);
} }
// 2. Musik- & Spezialglyphen entfernen $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}]/u', ' ', $text);
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
// 3. Falsche Worttrennungen reparieren: "ges pielt" → "gespielt"
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
// 4. Spezielle PDF-Apostrophe reparieren
$text = str_replace(
["\\'", "", ""],
"'",
$text
);
// 5. Mehrfache Satzzeichen bereinigen
$text = preg_replace('/([.,;:!?])\1+/', '$1', $text);
// 6. Überflüssige Leerzeichen & Zeilenumbrüche
$text = preg_replace('/\s+/u', ' ', $text); $text = preg_replace('/\s+/u', ' ', $text);
return trim($text); return trim(mb_substr($text, 0, 5000));
} catch (\Throwable) {
return '';
}
} }
} }