diff --git a/src/EventListener/IndexPageListener.php b/src/EventListener/IndexPageListener.php index 3c7f5d1..5cb648b 100644 --- a/src/EventListener/IndexPageListener.php +++ b/src/EventListener/IndexPageListener.php @@ -2,8 +2,14 @@ namespace MummertMedia\ContaoMeilisearchBundle\EventListener; +use MummertMedia\ContaoMeilisearchBundle\Service\PdfIndexService; + class IndexPageListener { + public function __construct( + private readonly PdfIndexService $pdfIndexService + ) {} + public function onIndexPage(string $content, array &$data, array &$set): void { // Marker vorhanden? @@ -92,17 +98,14 @@ class IndexPageListener /* * ===================== - * DEBUG: KOMPLETTES MARKUP + * PDF-ERKENNUNG * ===================== */ - $this->debugMarkup($content); + $pdfLinks = $this->findPdfLinks($content); - /* - * ===================== - * DEBUG: PDF-LINK GEFUNDEN - * ===================== - */ - $this->debugPdfLinks($content); + if ($pdfLinks !== []) { + $this->pdfIndexService->handlePdfLinks($pdfLinks); + } } private function extractMeilisearchJson(string $content): ?array @@ -117,22 +120,21 @@ class IndexPageListener return is_array($data) ? $data : null; } - private function debugMarkup(string $content): void + /** + * Erkennt: + * - direkte .pdf-Links + * - Contao-Download-Links (?p=pdf/ oder ?p=pdf%2F) + */ + private function findPdfLinks(string $content): array { - error_log( - "\n========== CRAWLER MARKUP START ==========\n" - . $content . - "\n=========== CRAWLER MARKUP END ===========\n" - ); - } - - private function debugPdfLinks(string $content): void - { - if (preg_match( - '/]*href=["\'][^"\']*(\.pdf|p=pdf(%2F|\/))[^"\']*["\']/i', - $content + if (!preg_match_all( + '/]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\']/i', + $content, + $matches )) { - error_log('PDF-Link gefunden'); + return []; } + + return array_unique(array_map('html_entity_decode', $matches[1])); } } \ No newline at end of file diff --git a/src/Service/PdfIndexService.php b/src/Service/PdfIndexService.php new file mode 100644 index 0000000..3a43116 --- /dev/null +++ b/src/Service/PdfIndexService.php @@ -0,0 +1,12 @@ +