diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php
index ffdebd1..672879d 100644
--- a/src/EventListener/IndexPageListener.php
+++ b/src/EventListener/IndexPageListener.php
@@ -132,13 +132,22 @@ class IndexPageListener
private function findPdfLinks(string $content): array
{
if (!preg_match_all(
- '/]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\']/i',
+ '/]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is',
$content,
$matches
)) {
return [];
}
- return array_unique(array_map('html_entity_decode', $matches[1]));
+ $result = [];
+
+ foreach ($matches[1] as $i => $href) {
+ $result[] = [
+ 'url' => html_entity_decode($href),
+ 'linkText' => trim(strip_tags($matches[2][$i])) ?: null,
+ ];
+ }
+
+ return $result;
}
}
\ No newline at end of file
diff --git a/src/Service/PdfIndexService.php b/src/Service/PdfIndexService.php
index 3f826f8..d7b7970 100644
--- a/src/Service/PdfIndexService.php
+++ b/src/Service/PdfIndexService.php
@@ -8,71 +8,84 @@ use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
class PdfIndexService
{
- private bool $tableReset = false;
private string $projectDir;
+ private bool $crawlStarted = false;
public function __construct(ParameterBagInterface $params)
{
$this->projectDir = rtrim($params->get('kernel.project_dir'), '/');
}
- /**
- * 🔥 Wird bei JEDEM Crawl einmal aufgerufen
- */
- public function resetTableOnce(): void
+ /* =====================================================
+ * Crawl-Start (immer aufrufen!)
+ * ===================================================== */
+ public function startCrawl(): void
{
- if ($this->tableReset) {
+ if ($this->crawlStarted) {
return;
}
- Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
- error_log('tl_search_pdf wurde geleert');
+ $this->crawlStarted = true;
- $this->tableReset = true;
+ // bewusst simpel: bei JEDEM Crawl komplett leeren
+ Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
+
+ error_log('PDF Crawl gestartet → tl_search_pdf geleert');
}
- /**
- * Einstiegspunkt vom Listener
- */
+ /* =====================================================
+ * Einstiegspunkt aus IndexPageListener
+ * ===================================================== */
public function handlePdfLinks(array $pdfLinks): void
{
- foreach ($pdfLinks as $url) {
+ foreach ($pdfLinks as $pdf) {
try {
- $path = $this->normalizePdfUrl($url);
- if ($path === null) {
+ $url = $pdf['url'];
+ $linkText = $pdf['linkText'] ?? null;
+
+ error_log('bearbeite PDF: ' . $url);
+
+ $relativePath = $this->normalizePdfUrl($url);
+ if ($relativePath === null) {
+ error_log('→ übersprungen: kein gültiger PDF-Pfad');
continue;
}
- $absolutePath = $this->projectDir . '/' . ltrim($path, '/');
+ $absolutePath = $this->projectDir . '/' . ltrim($relativePath, '/');
if (!is_file($absolutePath)) {
+ error_log('→ übersprungen: Datei existiert nicht');
continue;
}
- $parser = new Parser();
- $pdf = $parser->parseFile($absolutePath);
- $text = $this->cleanPdfContent($pdf->getText());
+ $mtime = filemtime($absolutePath) ?: 0;
+ $checksum = md5($relativePath . $mtime);
+
+ // PDF parsen
+ [$text, $metaTitle] = $this->parsePdf($absolutePath);
if ($text === '') {
+ error_log('→ übersprungen: kein Textinhalt');
continue;
}
- Database::getInstance()
- ->prepare('
- INSERT INTO tl_search_pdf
- (tstamp, url, title, text, checksum, file_mtime)
- VALUES (?, ?, ?, ?, ?, ?)
- ')
- ->execute(
- time(),
- $path,
- basename($absolutePath),
- mb_substr($text, 0, 5000),
- md5($path),
- filemtime($absolutePath) ?: 0
- );
+ // TITEL-PRIORITÄT
+ $title =
+ $linkText
+ ?: $metaTitle
+ ?: basename($absolutePath);
+
+ $this->insertPdf(
+ $relativePath,
+ $title,
+ $text,
+ $checksum,
+ $mtime
+ );
+
+ error_log('→ geschrieben in tl_search_pdf');
} catch (\Throwable $e) {
- error_log('PDF Fehler: ' . $e->getMessage());
+ error_log('PDF Service FEHLER: ' . $e->getMessage());
}
}
}
@@ -82,18 +95,22 @@ class PdfIndexService
* ===================================================== */
private function normalizePdfUrl(string $url): ?string
{
- if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) {
+ // direkter /files-Link
+ if (str_starts_with($url, '/files/') && str_ends_with(strtolower($url), '.pdf')) {
return $url;
}
- $parts = parse_url(html_entity_decode($url));
+ // Contao-Download-Link (?p=)
+ $decoded = html_entity_decode($url);
+ $parts = parse_url($decoded);
+
if (!isset($parts['query'])) {
return null;
}
parse_str($parts['query'], $query);
- if (!empty($query['p'])) {
+ if (!empty($query['p']) && str_ends_with(strtolower($query['p']), '.pdf')) {
return '/files/' . ltrim($query['p'], '/');
}
@@ -101,16 +118,76 @@ class PdfIndexService
}
/* =====================================================
- * Textbereinigung
+ * DB
+ * ===================================================== */
+ private function insertPdf(
+ string $url,
+ string $title,
+ string $text,
+ string $checksum,
+ int $mtime
+ ): void {
+ Database::getInstance()
+ ->prepare('
+ INSERT INTO tl_search_pdf
+ (tstamp, url, title, text, checksum, file_mtime)
+ VALUES (?, ?, ?, ?, ?, ?)
+ ')
+ ->execute(
+ time(),
+ $url,
+ $title,
+ $text,
+ $checksum,
+ $mtime
+ );
+ }
+
+ /* =====================================================
+ * PDF Parsing
+ * ===================================================== */
+ private function parsePdf(string $absolutePath): array
+ {
+ try {
+ $parser = new Parser();
+ $pdf = $parser->parseFile($absolutePath);
+
+ $details = $pdf->getDetails();
+ $metaTitle = $details['Title'] ?? null;
+
+ $text = $this->cleanPdfContent($pdf->getText());
+
+ return [
+ mb_substr($text, 0, 5000),
+ is_string($metaTitle) && trim($metaTitle) !== '' ? trim($metaTitle) : null,
+ ];
+
+ } catch (\Throwable $e) {
+ error_log('PDF Parser FEHLER: ' . $e->getMessage());
+ return ['', null];
+ }
+ }
+
+ /* =====================================================
+ * Text-Bereinigung
* ===================================================== */
private function cleanPdfContent(string $text): string
{
+ // Unicode normalisieren
if (class_exists(\Normalizer::class)) {
$text = \Normalizer::normalize($text, \Normalizer::FORM_C);
}
+ // Sonderglyphen entfernen (Noten, Steuerzeichen etc.)
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
- $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
+
+ // falsche Worttrennungen ("ges pielt")
+ $text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', '', $text);
+
+ // Apostrophe vereinheitlichen
+ $text = str_replace(["\\'", "’", "‘"], "'", $text);
+
+ // Mehrfach-Leerzeichen
$text = preg_replace('/\s+/u', ' ', $text);
return trim($text);