This commit is contained in:
Jürgen Mummert
2025-12-25 13:58:22 +01:00
parent 020e3500da
commit 48d5316085
+9 -141
View File
@@ -2,10 +2,6 @@
namespace MummertMedia\ContaoMeilisearchBundle\EventListener; namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
use Contao\Database;
use Contao\StringUtil;
use Smalot\PdfParser\Parser;
class IndexPageListener class IndexPageListener
{ {
public function onIndexPage(string $content, array &$data, array &$set): void public function onIndexPage(string $content, array &$data, array &$set): void
@@ -96,10 +92,10 @@ class IndexPageListener
/* /*
* ===================== * =====================
* PDFS ALS EIGENE DOKUMENTE INDEXIEREN * PDF DEBUG (NUR ERKENNEN)
* ===================== * =====================
*/ */
$this->indexPdfLinks($content); $this->debugPdfLinks($content);
} }
/* ===================================================== /* =====================================================
@@ -119,145 +115,17 @@ class IndexPageListener
} }
/* ===================================================== /* =====================================================
* PDF-Link-Erkennung * PDF DEBUG
* ===================================================== */ * ===================================================== */
private function indexPdfLinks(string $content): void private function debugPdfLinks(string $content): void
{ {
if (!preg_match_all( // irgendein <a href="..."> mit PDF (direkt oder über ?p=)
'/<a\s+[^>]*href=["\']([^"\']+)["\'][^>]*>(.*?)<\/a>/is', if (preg_match(
$content, '/<a\s+[^>]*href=["\'][^"\']*(\.pdf|\bp=pdf\/)[^"\']*["\']/i',
$matches $content
)) { )) {
return; error_log('PDF-Link gefunden');
}
foreach ($matches[1] as $i => $href) {
$title = trim(strip_tags($matches[2][$i])) ?: 'PDF';
$pdfUrl = $this->resolvePdfUrl($href);
if ($pdfUrl === null) {
continue;
}
$this->indexSinglePdf($pdfUrl, $title);
} }
} }
/**
 * Resolves an anchor href to an absolute PDF URL.
 *
 * Two link shapes are recognised:
 *  - direct PDF links whose URL path ends in ".pdf" (e.g. /files/...pdf)
 *  - download links carrying the file in a "p" query parameter (?p=...pdf);
 *    these are mapped to "/files/<p>".
 *
 * @param string $href Raw href attribute value; HTML entities are decoded first.
 *
 * @return string|null Absolute URL of the PDF, or null when the href does not
 *                     point to a PDF.
 */
private function resolvePdfUrl(string $href): ?string
{
    $href = html_entity_decode($href);

    // 1) Direct PDF link. parse_url() yields false/null for malformed URLs
    //    or URLs without a path, so only accept an actual string.
    $path = parse_url($href, PHP_URL_PATH);

    if (is_string($path) && str_ends_with(strtolower($path), '.pdf')) {
        return $this->normalizeUrl($href);
    }

    // 2) Download link (?p=...pdf)
    $query = parse_url($href, PHP_URL_QUERY);

    if (!is_string($query) || $query === '') {
        return null;
    }

    parse_str($query, $params);

    $file = $params['p'] ?? null;

    // parse_str() produces an array for "p[]=..." style parameters; calling
    // strtolower() on that would raise a TypeError, so require a string.
    if (!is_string($file) || !str_ends_with(strtolower($file), '.pdf')) {
        return null;
    }

    return $this->normalizeUrl('/files/' . ltrim($file, '/'));
}
/**
 * Turns a URL or path into an absolute http(s) URL.
 *
 * - URLs that already start with http:// or https:// are returned unchanged.
 * - Protocol-relative URLs ("//host/path") only get the current scheme
 *   prepended, so their own host is kept.
 * - Everything else is treated as a path on the current host.
 *
 * Scheme and host are read from $_SERVER (REQUEST_SCHEME, HTTP_HOST), falling
 * back to "https" and an empty host when those entries are not set.
 *
 * @param string $url Absolute URL, protocol-relative URL, or path.
 *
 * @return string Absolute URL.
 */
private function normalizeUrl(string $url): string
{
    if (preg_match('~^https?://~i', $url)) {
        return $url;
    }

    $scheme = $_SERVER['REQUEST_SCHEME'] ?? 'https';

    // "//cdn.example.com/file.pdf" already names its host; treating it as a
    // local path would produce "scheme://current-host/cdn.example.com/file.pdf".
    if (str_starts_with($url, '//')) {
        return $scheme . ':' . $url;
    }

    return $scheme
        . '://' . ($_SERVER['HTTP_HOST'] ?? '')
        . '/' . ltrim($url, '/');
}
/* =====================================================
* PDF-Indexierung
* ===================================================== */
/**
 * Stores one PDF as its own row in tl_search.
 *
 * The md5 hash of the URL serves as the row's checksum; if a row with that
 * checksum already exists, or the PDF yields no text, nothing is written.
 *
 * @param string $url   Absolute URL of the PDF.
 * @param string $title Link text used as the row title (entities are decoded).
 */
private function indexSinglePdf(string $url, string $title): void
{
    $hash = md5($url);
    $connection = Database::getInstance();

    // Already indexed? Then there is nothing to do.
    $lookup = $connection->prepare('SELECT id FROM tl_search WHERE checksum=?');
    $known = $lookup->execute($hash);

    if ($known->numRows > 0) {
        return;
    }

    $body = $this->parsePdf($url);

    if ($body === '') {
        return;
    }

    // The SQL literal is kept verbatim, including its line breaks.
    $insert = $connection->prepare('
INSERT INTO tl_search
(tstamp, title, url, text, checksum, protected, pid, type)
VALUES
(?, ?, ?, ?, ?, ?, ?, ?)
');

    $insert->execute(
        time(),
        StringUtil::decodeEntities($title),
        $url,
        $body,
        $hash,
        '',
        0,
        'file'
    );
}
/**
 * Downloads a PDF and extracts its plain text on a best-effort basis.
 *
 * Any failure (download yields nothing, parser throws) results in an empty
 * string rather than an exception.
 *
 * @param string $url Location of the PDF, passed to file_get_contents().
 *
 * @return string Cleaned text, truncated to at most 5000 characters, or ''.
 */
private function parsePdf(string $url): string
{
    try {
        $binary = @file_get_contents($url);

        if ($binary) {
            $document = (new Parser())->parseContent($binary);
            $cleaned = $this->cleanPdfContent($document->getText());

            // Deliberately capped (performance + relevance).
            return mb_substr($cleaned, 0, 5000);
        }
    } catch (\Throwable) {
        // Swallowed on purpose: an unreadable PDF is simply not indexed.
    }

    return '';
}
/**
 * Normalises text extracted from a PDF.
 *
 * The text is re-encoded UTF-8 to UTF-8, every ASCII control character
 * (0x00-0x1F, 0x7F) is replaced by a space, runs of whitespace are collapsed
 * to a single space, and the result is trimmed.
 *
 * @param string $content Raw extracted text.
 *
 * @return string Cleaned single-line text.
 */
private function cleanPdfContent(string $content): string
{
    $utf8 = mb_convert_encoding($content, 'UTF-8', 'UTF-8');

    // Patterns are applied in order: control characters first, then whitespace runs.
    $collapsed = preg_replace(
        ['/[\x00-\x1F\x7F]/u', '/\s+/u'],
        ' ',
        $utf8
    );

    return trim($collapsed);
}
} }