This commit is contained in:
Jürgen Mummert
2025-12-25 14:46:40 +01:00
parent e3c29d70e9
commit c8069b1ce3
+71 -77
View File
@@ -3,11 +3,22 @@
namespace MummertMedia\ContaoMeilisearchBundle\Service; namespace MummertMedia\ContaoMeilisearchBundle\Service;
use Contao\Database; use Contao\Database;
use Contao\StringUtil;
use Smalot\PdfParser\Parser; use Smalot\PdfParser\Parser;
use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
class PdfIndexService class PdfIndexService
{ {
private string $projectDir;
/**
 * Reads the 'kernel.project_dir' container parameter and stores it
 * with any trailing slashes removed (Contao 5 / Symfony compliant).
 *
 * @param ParameterBagInterface $params Parameter bag that provides 'kernel.project_dir'.
 */
public function __construct(ParameterBagInterface $params)
{
    $configuredDir = $params->get('kernel.project_dir');

    // Strip trailing slashes so the stored directory never ends in '/'.
    $this->projectDir = rtrim($configuredDir, '/');
}
/**
* Einstiegspunkt aus dem IndexPageListener
*/
public function handlePdfLinks(array $pdfLinks): void public function handlePdfLinks(array $pdfLinks): void
{ {
error_log('PDF Service aufgerufen'); error_log('PDF Service aufgerufen');
@@ -18,24 +29,24 @@ class PdfIndexService
try { try {
error_log('bearbeite PDF: ' . $url); error_log('bearbeite PDF: ' . $url);
$normalizedUrl = $this->normalizePdfUrl($url); $normalizedPath = $this->normalizePdfUrl($url);
error_log('umgewandelte URL: ' . var_export($normalizedUrl, true)); error_log('umgewandelte URL: ' . var_export($normalizedPath, true));
if ($normalizedUrl === null) { if ($normalizedPath === null) {
error_log('→ übersprungen: normalizePdfUrl() == null'); error_log('→ übersprungen: kein gültiger PDF-Pfad');
continue; continue;
} }
$absolutePath = $this->getAbsolutePath($normalizedUrl); $absolutePath = $this->getAbsolutePath($normalizedPath);
error_log('absoluter Pfad: ' . var_export($absolutePath, true)); error_log('absoluter Pfad: ' . var_export($absolutePath, true));
if ($absolutePath === null || !is_file($absolutePath)) { if (!is_file($absolutePath)) {
error_log('→ übersprungen: Datei nicht gefunden'); error_log('→ übersprungen: Datei existiert nicht');
continue; continue;
} }
$mtime = filemtime($absolutePath) ?: 0; $mtime = filemtime($absolutePath) ?: 0;
$checksum = md5($normalizedUrl . $mtime); $checksum = md5($normalizedPath . $mtime);
if ($this->alreadyIndexed($checksum)) { if ($this->alreadyIndexed($checksum)) {
error_log('→ übersprungen: bereits indexiert'); error_log('→ übersprungen: bereits indexiert');
@@ -47,12 +58,20 @@ class PdfIndexService
$text = $this->parsePdf($absolutePath); $text = $this->parsePdf($absolutePath);
if ($text === '') { if ($text === '') {
error_log('→ übersprungen: parsePdf() leer'); error_log('→ übersprungen: PDF ohne Textinhalt');
continue; continue;
} }
$this->insertPdf($normalizedUrl, $title, $text, $checksum, $mtime); $this->insertPdf(
$normalizedPath,
$title,
$text,
$checksum,
$mtime
);
error_log('geschrieben in tl_search_pdf'); error_log('geschrieben in tl_search_pdf');
} catch (\Throwable $e) { } catch (\Throwable $e) {
error_log('PDF Service FEHLER (pro PDF): ' . $e->getMessage()); error_log('PDF Service FEHLER (pro PDF): ' . $e->getMessage());
error_log($e->getTraceAsString()); error_log($e->getTraceAsString());
@@ -61,40 +80,37 @@ class PdfIndexService
} }
/* ===================================================== /* =====================================================
* PDF-Parsing * URL → relativer /files-Pfad
* ===================================================== */ * ===================================================== */
private function normalizePdfUrl(string $url): ?string
private function parsePdf(string $absolutePath): string
{ {
try { $url = html_entity_decode($url);
$parser = new Parser();
$pdf = $parser->parseFile($absolutePath);
$text = $this->cleanPdfContent($pdf->getText()); // direkter /files/*.pdf-Link
$path = parse_url($url, PHP_URL_PATH);
return mb_substr($text, 0, 5000); if ($path && preg_match('~^/files/.*\.pdf$~i', $path)) {
} catch (\Throwable $e) { return $path;
error_log('→ Fehler beim Parsen der PDF: ' . $e->getMessage());
return '';
}
} }
private function cleanPdfContent(string $content): string return null;
{
$content = StringUtil::decodeEntities($content);
$content = preg_replace('/[\x00-\x1F\x7F]/u', ' ', $content);
$content = preg_replace('/\s+/u', ' ', $content);
return trim($content);
} }
/* ===================================================== /* =====================================================
* DB * relativer Pfad → absoluter Pfad
* ===================================================== */ * ===================================================== */
/**
 * Builds a filesystem path by appending a relative path to the stored
 * project directory.
 *
 * @param string $relativePath Path to append; leading slashes are ignored.
 *
 * @return string The project directory, a slash, and the relative path.
 */
private function getAbsolutePath(string $relativePath): string
{
    // Drop leading slashes so the join below adds exactly one separator here.
    $withoutLeadingSlash = ltrim($relativePath, '/');

    return "{$this->projectDir}/{$withoutLeadingSlash}";
}
/* =====================================================
* DB-Helfer
* ===================================================== */
private function alreadyIndexed(string $checksum): bool private function alreadyIndexed(string $checksum): bool
{ {
$result = Database::getInstance() $db = Database::getInstance();
$result = $db
->prepare('SELECT id FROM tl_search_pdf WHERE checksum = ?') ->prepare('SELECT id FROM tl_search_pdf WHERE checksum = ?')
->execute($checksum); ->execute($checksum);
@@ -102,22 +118,23 @@ class PdfIndexService
} }
private function insertPdf( private function insertPdf(
string $url, string $path,
string $title, string $title,
string $text, string $text,
string $checksum, string $checksum,
int $mtime int $mtime
): void { ): void {
Database::getInstance() $db = Database::getInstance();
$db
->prepare(' ->prepare('
INSERT INTO tl_search_pdf INSERT INTO tl_search_pdf
(tstamp, url, title, text, checksum, file_mtime) (tstamp, path, title, text, checksum, file_mtime)
VALUES VALUES (?, ?, ?, ?, ?, ?)
(?, ?, ?, ?, ?, ?)
') ')
->execute( ->execute(
time(), time(),
$url, $path,
$title, $title,
$text, $text,
$checksum, $checksum,
@@ -126,53 +143,30 @@ class PdfIndexService
} }
/* ===================================================== /* =====================================================
* URL & Pfad-Helfer * PDF-Parsing
* ===================================================== */ * ===================================================== */
private function parsePdf(string $absolutePath): string
private function normalizePdfUrl(string $url): ?string
{ {
$url = html_entity_decode($url); try {
$parser = new Parser();
$pdf = $parser->parseFile($absolutePath);
// 1) direkter /files/*.pdf-Link (immer korrekt) $text = $this->cleanPdfContent($pdf->getText());
$path = parse_url($url, PHP_URL_PATH);
if ($path && preg_match('~^/files/.*\.pdf$~i', $path)) {
return $path;
}
// 2) Query-Parameter prüfen // bewusst begrenzen (Performance + Relevanz)
$query = parse_url($url, PHP_URL_QUERY); return mb_substr($text, 0, 5000);
if (!$query) {
return null;
}
parse_str($query, $params); } catch (\Throwable $e) {
error_log('PDF Parser FEHLER: ' . $e->getMessage());
// 2a) Contao p=pdf/xyz.pdf return '';
if (!empty($params['p']) && preg_match('~\.pdf$~i', $params['p'])) {
return '/files/' . ltrim($params['p'], '/');
}
// 2b) Contao Download: f=Dateiname → Dateisystem suchen
if (!empty($params['f'])) {
$file = basename($params['f']);
// Suche im /files-Verzeichnis (rekursiv, aber schnell genug)
$matches = glob(TL_ROOT . '/files/**/' . $file, GLOB_BRACE);
if (!empty($matches)) {
return str_replace(TL_ROOT, '', $matches[0]);
} }
} }
return null; private function cleanPdfContent(string $content): string
}
private function getAbsolutePath(string $url): ?string
{ {
if (!str_starts_with($url, '/files/')) { $content = preg_replace('/[\x00-\x1F\x7F]/u', ' ', $content);
return null; $content = preg_replace('/\s+/u', ' ', $content);
}
return TL_ROOT . $url; return trim($content);
} }
} }