This commit is contained in:
Jürgen Mummert
2025-12-23 13:45:10 +01:00
parent 4a09f8530a
commit 9fa59c4b1a
+86 -52
View File
@@ -2,6 +2,8 @@
namespace MummertMedia\ContaoMeilisearchBundle\EventListener; namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
use Contao\Database;
use Contao\StringUtil;
use Smalot\PdfParser\Parser; use Smalot\PdfParser\Parser;
class IndexPageListener class IndexPageListener
@@ -13,16 +15,15 @@ class IndexPageListener
return; return;
} }
// JSON aus Kommentar extrahieren + parsen // JSON aus Kommentar extrahieren
$parsed = $this->extractMeilisearchJson($content); $parsed = $this->extractMeilisearchJson($content);
if ($parsed === null) { if ($parsed === null) {
return; return;
} }
/* /*
* ===================== * =====================
* PRIORITY (event > news > page) * PRIORITY
* ===================== * =====================
*/ */
$priority = $priority =
@@ -36,7 +37,7 @@ class IndexPageListener
/* /*
* ===================== * =====================
* KEYWORDS (merge) * KEYWORDS
* ===================== * =====================
*/ */
$keywordSources = [ $keywordSources = [
@@ -45,26 +46,26 @@ class IndexPageListener
$parsed['page']['keywords'] ?? null, $parsed['page']['keywords'] ?? null,
]; ];
$kw = []; $keywords = [];
foreach ($keywordSources as $s) { foreach ($keywordSources as $src) {
if (!is_string($s) || trim($s) === '') { if (!is_string($src) || trim($src) === '') {
continue; continue;
} }
foreach (preg_split('/\s+/', trim($s)) ?: [] as $p) { foreach (preg_split('/\s+/', trim($src)) as $word) {
if ($p !== '') { if ($word !== '') {
$kw[] = $p; $keywords[] = $word;
} }
} }
} }
if ($kw) { if ($keywords) {
$set['keywords'] = implode(' ', array_unique($kw)); $set['keywords'] = implode(' ', array_unique($keywords));
} }
/* /*
* ===================== * =====================
* IMAGEPATH (event > news > page > custom) * IMAGEPATH
* ===================== * =====================
*/ */
$image = $image =
@@ -79,7 +80,7 @@ class IndexPageListener
/* /*
* ===================== * =====================
* STARTDATE (event/news) * STARTDATE
* ===================== * =====================
*/ */
$date = $date =
@@ -95,20 +96,16 @@ class IndexPageListener
/* /*
* ===================== * =====================
* PDF LINKS INDEXIEREN * PDFS ALS EIGENE DOKUMENTE INDEXIEREN
* ===================== * =====================
*/ */
$pdfText = $this->extractPdfTextFromContent($content); $this->indexPdfLinks($content);
if ($pdfText !== '') {
$set['text'] = trim(
($set['text'] ?? '') . "\n\n" . $pdfText
);
}
} }
/** /**
* Extrahiert das JSON aus <!-- MEILISEARCH_JSON {...} --> * -------------------------------------
* JSON aus Kommentar extrahieren
* -------------------------------------
*/ */
private function extractMeilisearchJson(string $content): ?array private function extractMeilisearchJson(string $content): ?array
{ {
@@ -123,39 +120,36 @@ class IndexPageListener
} }
/**
 * Finds links to PDF files in the given HTML and indexes each of them.
 *
 * Every <a> element whose href contains ".pdf" is handed to
 * indexSinglePdf() together with a title. The title is the link text
 * with tags stripped; when the link has no text, the file name from the
 * URL path is used instead.
 *
 * @param string $content HTML markup to scan for PDF links
 */
private function indexPdfLinks(string $content): void
{
    if (!preg_match_all(
        '/<a\s+[^>]*href=["\']([^"\']+\.pdf[^"\']*)["\'][^>]*>(.*?)<\/a>/is',
        $content,
        $matches,
        PREG_SET_ORDER
    )) {
        return;
    }

    $seen = [];

    foreach ($matches as [, $url, $linkText]) {
        // The same PDF may be linked several times on one page; handle it
        // once so it is not downloaded and parsed repeatedly.
        if (isset($seen[$url])) {
            continue;
        }
        $seen[$url] = true;

        $title = trim(strip_tags($linkText));

        // Explicit empty check: a link text of "0" is a valid title and
        // must not trigger the fallback.
        if ($title === '') {
            // Use only the path component so a query string or fragment
            // ("doc.pdf?v=2") does not end up in the title.
            $path = parse_url($url, PHP_URL_PATH);
            $title = basename(is_string($path) && $path !== '' ? $path : $url);
        }

        $this->indexSinglePdf($url, $title);
    }
}
/** /**
* Lädt und parsed ein PDF (nur /files/) * -------------------------------------
* Einzelnes PDF indexieren
* -------------------------------------
*/ */
private function parsePdfFromUrl(string $url): string private function indexSinglePdf(string $url, string $title): void
{ {
// Nur interne PDFs // nur interne PDFs
if (!str_contains($url, '/files/')) { if (!str_contains($url, '/files/')) {
return ''; return;
} }
// relative URLs normalisieren // relative URLs normalisieren
@@ -166,19 +160,62 @@ class IndexPageListener
. $url; . $url;
} }
$checksum = md5($url);
$db = Database::getInstance();
// bereits indexiert?
$exists = $db
->prepare('SELECT id FROM tl_search WHERE checksum=?')
->execute($checksum);
if ($exists->numRows > 0) {
return;
}
$text = $this->parsePdf($url);
if ($text === '') {
return;
}
$db
->prepare('
INSERT INTO tl_search
(tstamp, title, url, text, checksum, protected, pid, type)
VALUES
(?, ?, ?, ?, ?, ?, ?, ?)
')
->execute(
time(),
StringUtil::decodeEntities($title),
$url,
$text,
$checksum,
'',
0,
'file'
);
}
/**
* -------------------------------------
* PDF parsen (Smalot)
* -------------------------------------
*/
private function parsePdf(string $url): string
{
try { try {
$pdfContent = @file_get_contents($url); $content = @file_get_contents($url);
if (!$pdfContent) { if (!$content) {
return ''; return '';
} }
$parser = new Parser(); $parser = new Parser();
$pdf = $parser->parseContent($pdfContent); $pdf = $parser->parseContent($content);
$text = $this->cleanPdfContent($pdf->getText()); $text = $this->cleanPdfContent($pdf->getText());
// Begrenzung für Meilisearch // Begrenzen (Performance + Relevanz)
return mb_substr($text, 0, 2000); return mb_substr($text, 0, 5000);
} catch (\Throwable) { } catch (\Throwable) {
return ''; return '';
@@ -186,17 +223,14 @@ class IndexPageListener
} }
/** /**
* Bereinigt PDF-Text * -------------------------------------
* PDF-Text bereinigen
* -------------------------------------
*/ */
private function cleanPdfContent(string $content): string private function cleanPdfContent(string $content): string
{ {
// UTF-8 normalisieren
$content = mb_convert_encoding($content, 'UTF-8', 'UTF-8'); $content = mb_convert_encoding($content, 'UTF-8', 'UTF-8');
// Steuerzeichen entfernen
$content = preg_replace('/[\x00-\x1F\x7F]/u', ' ', $content); $content = preg_replace('/[\x00-\x1F\x7F]/u', ' ', $content);
// Whitespaces normalisieren
$content = preg_replace('/\s+/u', ' ', $content); $content = preg_replace('/\s+/u', ' ', $content);
return trim($content); return trim($content);