diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php
index f47f07e..9852cdb 100644
--- a/src/EventListener/IndexPageListener.php
+++ b/src/EventListener/IndexPageListener.php
@@ -2,10 +2,6 @@
namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
-use Contao\Database;
-use Contao\StringUtil;
-use Smalot\PdfParser\Parser;
-
class IndexPageListener
{
public function onIndexPage(string $content, array &$data, array &$set): void
@@ -96,10 +92,10 @@ class IndexPageListener
/*
* =====================
- * PDFS ALS EIGENE DOKUMENTE INDEXIEREN
+ * PDF DEBUG (NUR ERKENNEN)
* =====================
*/
- $this->indexPdfLinks($content);
+ $this->debugPdfLinks($content);
}
/* =====================================================
@@ -119,145 +115,17 @@ class IndexPageListener
}
/* =====================================================
- * PDF-Link-Erkennung
+ * PDF DEBUG
* ===================================================== */
- private function indexPdfLinks(string $content): void
+ private function debugPdfLinks(string $content): void
{
- if (!preg_match_all(
- '/<a[^>]*href=["\']([^"\']+)["\'][^>]*>(.*?)<\/a>/is',
- $content,
- $matches
+ // irgendein <a> mit PDF (direkt oder über ?p=)
+ if (preg_match(
+ '/<a[^>]*href=["\'][^"\']*(\.pdf|\bp=pdf\/)[^"\']*["\']/i',
+ $content
)) {
- return;
+ error_log('PDF-Link gefunden');
}
-
- foreach ($matches[1] as $i => $href) {
- $title = trim(strip_tags($matches[2][$i])) ?: 'PDF';
-
- $pdfUrl = $this->resolvePdfUrl($href);
- if ($pdfUrl === null) {
- continue;
- }
-
- $this->indexSinglePdf($pdfUrl, $title);
- }
- }
-
- /**
- * Erkennt
- * - direkte PDF-Links (/files/...pdf)
- * - Contao-Download-Links (?p=...pdf)
- */
- private function resolvePdfUrl(string $href): ?string
- {
- $href = html_entity_decode($href);
-
- // 1) Direkter PDF-Link
- $path = parse_url($href, PHP_URL_PATH);
- if ($path && str_ends_with(strtolower($path), '.pdf')) {
- return $this->normalizeUrl($href);
- }
-
- // 2) Contao-Download-Link (?p=...pdf)
- $query = parse_url($href, PHP_URL_QUERY);
- if (!$query) {
- return null;
- }
-
- parse_str($query, $params);
-
- if (
- empty($params['p']) ||
- !str_ends_with(strtolower($params['p']), '.pdf')
- ) {
- return null;
- }
-
- return $this->normalizeUrl('/files/' . ltrim($params['p'], '/'));
- }
-
- private function normalizeUrl(string $url): string
- {
- if (preg_match('~^https?://~i', $url)) {
- return $url;
- }
-
- return
- ($_SERVER['REQUEST_SCHEME'] ?? 'https')
- . '://' . ($_SERVER['HTTP_HOST'] ?? '')
- . '/' . ltrim($url, '/');
- }
-
- /* =====================================================
- * PDF-Indexierung
- * ===================================================== */
-
- private function indexSinglePdf(string $url, string $title): void
- {
- $checksum = md5($url);
- $db = Database::getInstance();
-
- // schon indexiert?
- $exists = $db
- ->prepare('SELECT id FROM tl_search WHERE checksum=?')
- ->execute($checksum);
-
- if ($exists->numRows > 0) {
- return;
- }
-
- $text = $this->parsePdf($url);
- if ($text === '') {
- return;
- }
-
- $db
- ->prepare('
- INSERT INTO tl_search
- (tstamp, title, url, text, checksum, protected, pid, type)
- VALUES
- (?, ?, ?, ?, ?, ?, ?, ?)
- ')
- ->execute(
- time(),
- StringUtil::decodeEntities($title),
- $url,
- $text,
- $checksum,
- '',
- 0,
- 'file'
- );
- }
-
- private function parsePdf(string $url): string
- {
- try {
- $content = @file_get_contents($url);
- if (!$content) {
- return '';
- }
-
- $parser = new Parser();
- $pdf = $parser->parseContent($content);
-
- $text = $this->cleanPdfContent($pdf->getText());
-
- // bewusst begrenzen (Performance + Relevanz)
- return mb_substr($text, 0, 5000);
-
- } catch (\Throwable) {
- return '';
- }
- }
-
- private function cleanPdfContent(string $content): string
- {
- $content = mb_convert_encoding($content, 'UTF-8', 'UTF-8');
- $content = preg_replace('/[\x00-\x1F\x7F]/u', ' ', $content);
- $content = preg_replace('/\s+/u', ' ', $content);
-
- return trim($content);
}
}
\ No newline at end of file