This commit is contained in:
Jürgen Mummert
2025-12-25 21:52:22 +01:00
parent 6e6f5904d9
commit bbb4d5cc6c
2 changed files with 127 additions and 41 deletions
+11 -2
View File
@@ -132,13 +132,22 @@ class IndexPageListener
/**
 * Collects the PDF links contained in an HTML fragment.
 *
 * A link qualifies when its href ends in ".pdf" or contains a download
 * parameter of the form "p=pdf/..." (also with the slash encoded as %2F).
 * Each distinct URL is reported once, in order of first appearance; if the
 * same URL is linked several times, the first non-empty link text wins.
 *
 * @param string $content HTML markup to scan
 *
 * @return list<array{url: string, linkText: ?string}> entity-decoded URL plus
 *         the plain-text label of the anchor (null when the anchor has no text)
 */
private function findPdfLinks(string $content): array
{
    $found = preg_match_all(
        '/<a\s+[^>]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is',
        $content,
        $matches
    );

    // 0 = no match, false = PCRE error; in both cases there is nothing to report
    if (!$found) {
        return [];
    }

    $result = [];

    foreach ($matches[1] as $i => $href) {
        $url = html_entity_decode($href);

        // Strip nested markup first, then decode entities so that "&amp;" or
        // "&uuml;" do not end up verbatim in the title.
        $linkText = trim(html_entity_decode(
            strip_tags($matches[2][$i]),
            ENT_QUOTES | ENT_HTML5,
            'UTF-8'
        ));
        $linkText = $linkText !== '' ? $linkText : null;

        if (!isset($result[$url])) {
            $result[$url] = ['url' => $url, 'linkText' => $linkText];
            continue;
        }

        // Duplicate link: keep one entry, but adopt a label if the first
        // occurrence (e.g. an image-only anchor) had none.
        if ($result[$url]['linkText'] === null) {
            $result[$url]['linkText'] = $linkText;
        }
    }

    return array_values($result);
}
} }
+114 -37
View File
@@ -8,71 +8,84 @@ use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
class PdfIndexService class PdfIndexService
{ {
// Removed side of the diff: flag that resetTableOnce() set after truncating tl_search_pdf.
private bool $tableReset = false;
// Value of the "kernel.project_dir" parameter with trailing slashes removed (set in the constructor).
private string $projectDir; private string $projectDir;
// Added side of the diff: set by startCrawl() so the TRUNCATE runs at most once per instance.
private bool $crawlStarted = false;
/**
 * Stores the project root directory without a trailing slash.
 *
 * @param ParameterBagInterface $params container parameters; only
 *        "kernel.project_dir" is read
 */
public function __construct(ParameterBagInterface $params)
{
    $kernelProjectDir = $params->get('kernel.project_dir');

    $this->projectDir = rtrim($kernelProjectDir, '/');
}
/* =====================================================
 * Crawl start (always call this!)
 * ===================================================== */

/**
 * Empties tl_search_pdf the first time it is called on this instance.
 *
 * Subsequent calls on the same instance return immediately.
 *
 * NOTE(review): the guard is an instance property, so it only holds for the
 * lifetime of this object. If a crawl spans several PHP requests/processes,
 * each of them would truncate the table again — confirm against how the
 * crawler invokes this service.
 */
public function startCrawl(): void
{
    if ($this->crawlStarted) {
        return;
    }

    $this->crawlStarted = true;

    // deliberately simple: wipe the whole table on every crawl
    Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');

    error_log('PDF Crawl gestartet → tl_search_pdf geleert');
}

/**
 * Former name of startCrawl(), kept so existing callers keep working.
 *
 * @deprecated use startCrawl() instead
 */
public function resetTableOnce(): void
{
    $this->startCrawl();
}
/* =====================================================
 * Entry point called from IndexPageListener
 * ===================================================== */

/**
 * Indexes the given PDF links into tl_search_pdf.
 *
 * For every entry the URL is normalised to a path below the project
 * directory, the file is parsed and one row is inserted. Entries that cannot
 * be resolved, do not exist on disk, yield no text, or point to a file that
 * was already handled in this call are skipped. Failures of a single entry
 * are logged and do not abort the remaining entries.
 *
 * @param array<int, array{url?: mixed, linkText?: mixed}|string> $pdfLinks
 *        link descriptors with "url" and optional "linkText"; plain URL
 *        strings (the earlier list format) are accepted as well
 */
public function handlePdfLinks(array $pdfLinks): void
{
    // Paths already handled in this call; the same PDF is often linked
    // more than once on a page and must not be parsed/inserted twice.
    $seen = [];

    foreach ($pdfLinks as $pdf) {
        try {
            // Accept the plain-string shape in addition to the array shape.
            if (is_string($pdf)) {
                $pdf = ['url' => $pdf, 'linkText' => null];
            }

            $url = is_array($pdf) ? ($pdf['url'] ?? null) : null;

            if (!is_string($url) || $url === '') {
                error_log('→ übersprungen: kein gültiger PDF-Pfad');
                continue;
            }

            $linkText = $pdf['linkText'] ?? null;
            if (!is_string($linkText)) {
                $linkText = null;
            }

            error_log('bearbeite PDF: ' . $url);

            $relativePath = $this->normalizePdfUrl($url);

            if ($relativePath === null) {
                error_log('→ übersprungen: kein gültiger PDF-Pfad');
                continue;
            }

            if (isset($seen[$relativePath])) {
                error_log('→ übersprungen: bereits verarbeitet');
                continue;
            }
            $seen[$relativePath] = true;

            $absolutePath = $this->projectDir . '/' . ltrim($relativePath, '/');

            if (!is_file($absolutePath)) {
                error_log('→ übersprungen: Datei existiert nicht');
                continue;
            }

            // The checksum changes whenever the file's modification time does.
            $mtime = filemtime($absolutePath) ?: 0;
            $checksum = md5($relativePath . $mtime);

            // Parse the PDF
            [$text, $metaTitle] = $this->parsePdf($absolutePath);

            if ($text === '') {
                error_log('→ übersprungen: kein Textinhalt');
                continue;
            }

            // Title priority: link text, then PDF metadata title, then file name
            $title =
                $linkText
                ?: $metaTitle
                ?: basename($absolutePath);

            $this->insertPdf(
                $relativePath,
                $title,
                $text,
                $checksum,
                $mtime
            );

            error_log('→ geschrieben in tl_search_pdf');

        } catch (\Throwable $e) {
            // Best effort: one broken PDF must not stop the remaining ones.
            error_log('PDF Service FEHLER: ' . $e->getMessage());
        }
    }
}
@@ -82,18 +95,22 @@ class PdfIndexService
* ===================================================== */ * ===================================================== */
private function normalizePdfUrl(string $url): ?string private function normalizePdfUrl(string $url): ?string
{ {
if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) { // direkter /files-Link
if (str_starts_with($url, '/files/') && str_ends_with(strtolower($url), '.pdf')) {
return $url; return $url;
} }
$parts = parse_url(html_entity_decode($url)); // Contao-Download-Link (?p=)
$decoded = html_entity_decode($url);
$parts = parse_url($decoded);
if (!isset($parts['query'])) { if (!isset($parts['query'])) {
return null; return null;
} }
parse_str($parts['query'], $query); parse_str($parts['query'], $query);
if (!empty($query['p'])) { if (!empty($query['p']) && str_ends_with(strtolower($query['p']), '.pdf')) {
return '/files/' . ltrim($query['p'], '/'); return '/files/' . ltrim($query['p'], '/');
} }
@@ -101,16 +118,76 @@ class PdfIndexService
} }
/* =====================================================
 * DB
 * ===================================================== */

/**
 * Writes one row for a parsed PDF into tl_search_pdf.
 *
 * The tstamp column receives the current time; all other columns are taken
 * from the arguments unchanged.
 *
 * @param string $url      value for the url column
 * @param string $title    value for the title column
 * @param string $text     value for the text column
 * @param string $checksum value for the checksum column
 * @param int    $mtime    value for the file_mtime column
 */
private function insertPdf(
    string $url,
    string $title,
    string $text,
    string $checksum,
    int $mtime
): void {
    $statement = Database::getInstance()->prepare('
        INSERT INTO tl_search_pdf
        (tstamp, url, title, text, checksum, file_mtime)
        VALUES (?, ?, ?, ?, ?, ?)
    ');

    $now = time();

    $statement->execute($now, $url, $title, $text, $checksum, $mtime);
}
/* =====================================================
 * PDF parsing
 * ===================================================== */

/**
 * Reads the text and the metadata title of a PDF file.
 *
 * The text is passed through cleanPdfContent() and cut to 5000 characters.
 * Any error while parsing is logged and reported as an empty result instead
 * of being propagated.
 *
 * @param string $absolutePath path of the PDF file to parse
 *
 * @return array{0: string, 1: ?string} cleaned text ('' on failure) and the
 *         trimmed "Title" detail, or null when it is missing, not a string,
 *         or blank
 */
private function parsePdf(string $absolutePath): array
{
    try {
        $document = (new Parser())->parseFile($absolutePath);

        $details = $document->getDetails();
        $rawTitle = $details['Title'] ?? null;

        $cleaned = $this->cleanPdfContent($document->getText());

        // Only a non-blank string counts as a usable title.
        $metaTitle = null;
        if (is_string($rawTitle)) {
            $trimmedTitle = trim($rawTitle);
            if ($trimmedTitle !== '') {
                $metaTitle = $trimmedTitle;
            }
        }

        return [mb_substr($cleaned, 0, 5000), $metaTitle];
    } catch (\Throwable $e) {
        error_log('PDF Parser FEHLER: ' . $e->getMessage());
    }

    return ['', null];
}
/* =====================================================
* Text-Bereinigung
* ===================================================== */ * ===================================================== */
private function cleanPdfContent(string $text): string private function cleanPdfContent(string $text): string
{ {
// Unicode normalisieren
if (class_exists(\Normalizer::class)) { if (class_exists(\Normalizer::class)) {
$text = \Normalizer::normalize($text, \Normalizer::FORM_C); $text = \Normalizer::normalize($text, \Normalizer::FORM_C);
} }
// Sonderglyphen entfernen (Noten, Steuerzeichen etc.)
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text); $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
// falsche Worttrennungen ("ges pielt")
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', '', $text); $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', '', $text);
// Apostrophe vereinheitlichen
$text = str_replace(["\\'", "", ""], "'", $text);
// Mehrfach-Leerzeichen
$text = preg_replace('/\s+/u', ' ', $text); $text = preg_replace('/\s+/u', ' ', $text);
return trim($text); return trim($text);