This commit is contained in:
Jürgen Mummert
2025-12-25 15:53:35 +01:00
parent 6d86faa62f
commit 85cbf2a235
2 changed files with 52 additions and 108 deletions
+11 -17
View File
@@ -11,6 +11,12 @@ class IndexPageListener
public function onIndexPage(string $content, array &$data, array &$set): void public function onIndexPage(string $content, array &$data, array &$set): void
{ {
// 🔑 Service einmal pro Crawl initialisieren + Reset ausführen
if ($this->pdfIndexService === null) {
$this->pdfIndexService = System::getContainer()->get(PdfIndexService::class);
$this->pdfIndexService->resetTableOnce();
}
// Marker vorhanden? // Marker vorhanden?
if (!str_contains($content, 'MEILISEARCH_JSON')) { if (!str_contains($content, 'MEILISEARCH_JSON')) {
return; return;
@@ -53,9 +59,7 @@ class IndexPageListener
} }
foreach (preg_split('/\s+/', trim($src)) as $word) { foreach (preg_split('/\s+/', trim($src)) as $word) {
if ($word !== '') { $keywords[] = $word;
$keywords[] = $word;
}
} }
} }
@@ -101,17 +105,9 @@ class IndexPageListener
*/ */
$pdfLinks = $this->findPdfLinks($content); $pdfLinks = $this->findPdfLinks($content);
if ($pdfLinks === []) { if ($pdfLinks !== []) {
return; $this->pdfIndexService->handlePdfLinks($pdfLinks);
} }
// Service lazy aus Container holen
if ($this->pdfIndexService === null) {
$this->pdfIndexService = System::getContainer()->get(PdfIndexService::class);
$this->pdfIndexService->resetTableOnce();
}
$this->pdfIndexService->handlePdfLinks($pdfLinks);
} }
/* ===================================================== /* =====================================================
@@ -130,7 +126,7 @@ class IndexPageListener
} }
/* ===================================================== /* =====================================================
* PDF-Links im Markup finden * PDF-Links im HTML finden
* ===================================================== */ * ===================================================== */
private function findPdfLinks(string $content): array private function findPdfLinks(string $content): array
{ {
@@ -142,8 +138,6 @@ class IndexPageListener
return []; return [];
} }
return array_unique( return array_unique(array_map('html_entity_decode', $matches[1]));
array_map('html_entity_decode', $matches[1])
);
} }
} }
+41 -91
View File
@@ -8,17 +8,17 @@ use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
class PdfIndexService class PdfIndexService
{ {
private string $projectDir;
private bool $tableReset = false; private bool $tableReset = false;
private string $projectDir;
public function __construct(ParameterBagInterface $params) public function __construct(ParameterBagInterface $params)
{ {
$this->projectDir = rtrim($params->get('kernel.project_dir'), '/'); $this->projectDir = rtrim($params->get('kernel.project_dir'), '/');
} }
/* ===================================================== /**
* Reset tl_search_pdf einmal pro Crawl * 🔥 Wird bei JEDEM Crawl einmal aufgerufen
* ===================================================== */ */
public function resetTableOnce(): void public function resetTableOnce(): void
{ {
if ($this->tableReset) { if ($this->tableReset) {
@@ -26,50 +26,53 @@ class PdfIndexService
} }
Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf'); Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
$this->tableReset = true; error_log('tl_search_pdf wurde geleert');
error_log('PDF Reset: tl_search_pdf geleert'); $this->tableReset = true;
} }
/* ===================================================== /**
* Einstiegspunkt aus Listener * Einstiegspunkt vom Listener
* ===================================================== */ */
public function handlePdfLinks(array $pdfLinks): void public function handlePdfLinks(array $pdfLinks): void
{ {
foreach ($pdfLinks as $url) { foreach ($pdfLinks as $url) {
try { try {
$normalizedPath = $this->normalizePdfUrl($url); $path = $this->normalizePdfUrl($url);
if ($normalizedPath === null) { if ($path === null) {
continue; continue;
} }
$absolutePath = $this->getAbsolutePath($normalizedPath); $absolutePath = $this->projectDir . '/' . ltrim($path, '/');
if (!is_file($absolutePath)) { if (!is_file($absolutePath)) {
continue; continue;
} }
$mtime = filemtime($absolutePath) ?: 0; $parser = new Parser();
$checksum = md5($normalizedPath . $mtime); $pdf = $parser->parseFile($absolutePath);
$text = $this->cleanPdfContent($pdf->getText());
if ($this->alreadyIndexed($checksum)) {
continue;
}
$text = $this->parsePdf($absolutePath);
if ($text === '') { if ($text === '') {
continue; continue;
} }
$this->insertPdf( Database::getInstance()
$normalizedPath, ->prepare('
basename($absolutePath), INSERT INTO tl_search_pdf
$text, (tstamp, url, title, text, checksum, file_mtime)
$checksum, VALUES (?, ?, ?, ?, ?, ?)
$mtime ')
); ->execute(
time(),
$path,
basename($absolutePath),
mb_substr($text, 0, 5000),
md5($path),
filemtime($absolutePath) ?: 0
);
} catch (\Throwable $e) { } catch (\Throwable $e) {
error_log('PDF Service Fehler: ' . $e->getMessage()); error_log('PDF Fehler: ' . $e->getMessage());
} }
} }
} }
@@ -83,9 +86,7 @@ class PdfIndexService
return $url; return $url;
} }
$decoded = html_entity_decode($url); $parts = parse_url(html_entity_decode($url));
$parts = parse_url($decoded);
if (!isset($parts['query'])) { if (!isset($parts['query'])) {
return null; return null;
} }
@@ -100,69 +101,18 @@ class PdfIndexService
} }
/* ===================================================== /* =====================================================
* Pfade * Textbereinigung
* ===================================================== */ * ===================================================== */
private function getAbsolutePath(string $relativePath): string private function cleanPdfContent(string $text): string
{ {
return $this->projectDir . '/' . ltrim($relativePath, '/'); if (class_exists(\Normalizer::class)) {
} $text = \Normalizer::normalize($text, \Normalizer::FORM_C);
/* =====================================================
* DB
* ===================================================== */
private function alreadyIndexed(string $checksum): bool
{
$result = Database::getInstance()
->prepare('SELECT id FROM tl_search_pdf WHERE checksum = ?')
->execute($checksum);
return $result->numRows > 0;
}
private function insertPdf(
string $path,
string $title,
string $text,
string $checksum,
int $mtime
): void {
Database::getInstance()
->prepare('
INSERT INTO tl_search_pdf
(tstamp, url, title, text, checksum, file_mtime)
VALUES (?, ?, ?, ?, ?, ?)
')
->execute(
time(),
$path,
$title,
$text,
$checksum,
$mtime
);
}
/* =====================================================
* PDF-Parsing
* ===================================================== */
private function parsePdf(string $absolutePath): string
{
try {
$parser = new Parser();
$pdf = $parser->parseFile($absolutePath);
$text = $pdf->getText();
if (class_exists(\Normalizer::class)) {
$text = \Normalizer::normalize($text, \Normalizer::FORM_C);
}
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}]/u', ' ', $text);
$text = preg_replace('/\s+/u', ' ', $text);
return trim(mb_substr($text, 0, 5000));
} catch (\Throwable) {
return '';
} }
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
$text = preg_replace('/\s+/u', ' ', $text);
return trim($text);
} }
} }