diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php
index 27dc811..1cb02cf 100644
--- a/src/EventListener/IndexPageListener.php
+++ b/src/EventListener/IndexPageListener.php
@@ -2,6 +2,8 @@
namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
+use Contao\Database;
+use Contao\StringUtil;
use Smalot\PdfParser\Parser;
class IndexPageListener
@@ -13,16 +15,15 @@ class IndexPageListener
return;
}
- // JSON aus Kommentar extrahieren + parsen
+ // JSON aus Kommentar extrahieren
$parsed = $this->extractMeilisearchJson($content);
-
if ($parsed === null) {
return;
}
/*
* =====================
- * PRIORITY (event > news > page)
+ * PRIORITY
* =====================
*/
$priority =
@@ -36,7 +37,7 @@ class IndexPageListener
/*
* =====================
- * KEYWORDS (merge)
+ * KEYWORDS
* =====================
*/
$keywordSources = [
@@ -45,26 +46,26 @@ class IndexPageListener
$parsed['page']['keywords'] ?? null,
];
- $kw = [];
- foreach ($keywordSources as $s) {
- if (!is_string($s) || trim($s) === '') {
+ $keywords = [];
+ foreach ($keywordSources as $src) {
+ if (!is_string($src) || trim($src) === '') {
continue;
}
- foreach (preg_split('/\s+/', trim($s)) ?: [] as $p) {
- if ($p !== '') {
- $kw[] = $p;
+ foreach (preg_split('/\s+/', trim($src)) as $word) {
+ if ($word !== '') {
+ $keywords[] = $word;
}
}
}
- if ($kw) {
- $set['keywords'] = implode(' ', array_unique($kw));
+ if ($keywords) {
+ $set['keywords'] = implode(' ', array_unique($keywords));
}
/*
* =====================
- * IMAGEPATH (event > news > page > custom)
+ * IMAGEPATH
* =====================
*/
$image =
@@ -79,7 +80,7 @@ class IndexPageListener
/*
* =====================
- * STARTDATE (event/news)
+ * STARTDATE
* =====================
*/
$date =
@@ -95,20 +96,16 @@ class IndexPageListener
/*
* =====================
- * PDF LINKS INDEXIEREN
+ * PDFS ALS EIGENE DOKUMENTE INDEXIEREN
* =====================
*/
- $pdfText = $this->extractPdfTextFromContent($content);
-
- if ($pdfText !== '') {
- $set['text'] = trim(
- ($set['text'] ?? '') . "\n\n" . $pdfText
- );
- }
+ $this->indexPdfLinks($content);
}
/**
- * Extrahiert das JSON aus
+ * -------------------------------------
+ * JSON aus Kommentar extrahieren
+ * -------------------------------------
*/
private function extractMeilisearchJson(string $content): ?array
{
@@ -123,39 +120,36 @@ class IndexPageListener
}
/**
- * Findet PDF-Links im HTML und extrahiert deren Text
+ * -------------------------------------
+ * PDF-Links finden und indexieren
+ * -------------------------------------
*/
- private function extractPdfTextFromContent(string $content): string
+ private function indexPdfLinks(string $content): void
{
if (!preg_match_all(
- '/]*href=["\']([^"\']+\.pdf[^"\']*)["\'][^>]*>/i',
+ '/]*href=["\']([^"\']+\.pdf[^"\']*)["\'][^>]*>(.*?)<\/a>/is',
$content,
$matches
)) {
- return '';
+ return;
}
- $texts = [];
-
- foreach ($matches[1] as $url) {
- $pdfText = $this->parsePdfFromUrl($url);
-
- if ($pdfText !== '') {
- $texts[] = $pdfText;
- }
+ foreach ($matches[1] as $i => $url) {
+ $title = trim(strip_tags($matches[2][$i])) ?: basename($url);
+ $this->indexSinglePdf($url, $title);
}
-
- return implode("\n\n", $texts);
}
/**
- * Lädt und parsed ein PDF (nur /files/)
+ * -------------------------------------
+ * Einzelnes PDF indexieren
+ * -------------------------------------
*/
- private function parsePdfFromUrl(string $url): string
+ private function indexSinglePdf(string $url, string $title): void
{
- // Nur interne PDFs
+ // nur interne PDFs
if (!str_contains($url, '/files/')) {
- return '';
+ return;
}
// relative URLs normalisieren
@@ -166,19 +160,62 @@ class IndexPageListener
. $url;
}
+ $checksum = md5($url);
+ $db = Database::getInstance();
+
+ // bereits indexiert?
+ $exists = $db
+ ->prepare('SELECT id FROM tl_search WHERE checksum=?')
+ ->execute($checksum);
+
+ if ($exists->numRows > 0) {
+ return;
+ }
+
+ $text = $this->parsePdf($url);
+ if ($text === '') {
+ return;
+ }
+
+ $db
+ ->prepare('
+ INSERT INTO tl_search
+ (tstamp, title, url, text, checksum, protected, pid, type)
+ VALUES
+ (?, ?, ?, ?, ?, ?, ?, ?)
+ ')
+ ->execute(
+ time(),
+ StringUtil::decodeEntities($title),
+ $url,
+ $text,
+ $checksum,
+ '',
+ 0,
+ 'file'
+ );
+ }
+
+ /**
+ * -------------------------------------
+ * PDF parsen (Smalot)
+ * -------------------------------------
+ */
+ private function parsePdf(string $url): string
+ {
try {
- $pdfContent = @file_get_contents($url);
- if (!$pdfContent) {
+ $content = @file_get_contents($url);
+ if (!$content) {
return '';
}
$parser = new Parser();
- $pdf = $parser->parseContent($pdfContent);
+ $pdf = $parser->parseContent($content);
$text = $this->cleanPdfContent($pdf->getText());
- // Begrenzung für Meilisearch
- return mb_substr($text, 0, 2000);
+ // Begrenzen (Performance + Relevanz)
+ return mb_substr($text, 0, 5000);
} catch (\Throwable) {
return '';
@@ -186,17 +223,14 @@ class IndexPageListener
}
/**
- * Bereinigt PDF-Text
+ * -------------------------------------
+ * PDF-Text bereinigen
+ * -------------------------------------
*/
private function cleanPdfContent(string $content): string
{
- // UTF-8 normalisieren
$content = mb_convert_encoding($content, 'UTF-8', 'UTF-8');
-
- // Steuerzeichen entfernen
$content = preg_replace('/[\x00-\x1F\x7F]/u', ' ', $content);
-
- // Whitespaces normalisieren
$content = preg_replace('/\s+/u', ' ', $content);
return trim($content);