This commit is contained in:
Jürgen Mummert
2025-12-25 14:19:23 +01:00
parent ebbfdface7
commit 1ae7f910ca
2 changed files with 36 additions and 22 deletions
+24 -22
View File
@@ -2,8 +2,14 @@
namespace MummertMedia\ContaoMeilisearchBundle\EventListener; namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
use MummertMedia\ContaoMeilisearchBundle\Service\PdfIndexService;
class IndexPageListener class IndexPageListener
{ {
public function __construct(
private readonly PdfIndexService $pdfIndexService
) {}
public function onIndexPage(string $content, array &$data, array &$set): void public function onIndexPage(string $content, array &$data, array &$set): void
{ {
// Marker vorhanden? // Marker vorhanden?
@@ -92,17 +98,14 @@ class IndexPageListener
/* /*
* ===================== * =====================
* DEBUG: KOMPLETTES MARKUP * PDF-ERKENNUNG
* ===================== * =====================
*/ */
$this->debugMarkup($content); $pdfLinks = $this->findPdfLinks($content);
/* if ($pdfLinks !== []) {
* ===================== $this->pdfIndexService->handlePdfLinks($pdfLinks);
* DEBUG: PDF-LINK GEFUNDEN }
* =====================
*/
$this->debugPdfLinks($content);
} }
private function extractMeilisearchJson(string $content): ?array private function extractMeilisearchJson(string $content): ?array
@@ -117,22 +120,21 @@ class IndexPageListener
return is_array($data) ? $data : null; return is_array($data) ? $data : null;
} }
private function debugMarkup(string $content): void /**
* Erkennt:
* - direkte .pdf-Links
* - Contao-Download-Links (?p=pdf/ oder ?p=pdf%2F)
*/
private function findPdfLinks(string $content): array
{ {
error_log( if (!preg_match_all(
"\n========== CRAWLER MARKUP START ==========\n" '/<a\s+[^>]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\']/i',
. $content . $content,
"\n=========== CRAWLER MARKUP END ===========\n" $matches
); )) {
return [];
} }
private function debugPdfLinks(string $content): void return array_unique(array_map('html_entity_decode', $matches[1]));
{
if (preg_match(
'/<a\s+[^>]*href=["\'][^"\']*(\.pdf|p=pdf(%2F|\/))[^"\']*["\']/i',
$content
)) {
error_log('PDF-Link gefunden');
}
} }
} }
+12
View File
@@ -0,0 +1,12 @@
<?php
namespace MummertMedia\ContaoMeilisearchBundle\Service;
/**
 * Receives lists of PDF links for later processing.
 *
 * At present this is a stub: it performs no parsing or indexing and only
 * writes a fixed marker line to the PHP error log when invoked.
 */
class PdfIndexService
{
    /**
     * Entry point for a batch of PDF links.
     *
     * @param array $pdfLinks Links handed over by the caller; currently not
     *                        inspected by this method.
     */
    public function handlePdfLinks(array $pdfLinks): void
    {
        // Placeholder: the real handling logic will be added later.
        $logMessage = 'PDF-Parser-Service aufgerufen';

        error_log($logMessage);
    }
}