This commit is contained in:
Jürgen Mummert
2025-12-26 11:24:55 +01:00
parent 7415df53d7
commit 8f3a0ad1b2
2 changed files with 352 additions and 75 deletions
+61 -27
View File
@@ -5,40 +5,34 @@ namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
use Contao\Config; use Contao\Config;
use Contao\System; use Contao\System;
use MummertMedia\ContaoMeilisearchBundle\Service\PdfIndexService; use MummertMedia\ContaoMeilisearchBundle\Service\PdfIndexService;
use MummertMedia\ContaoMeilisearchBundle\Service\OfficeIndexService;
class IndexPageListener class IndexPageListener
{ {
private ?PdfIndexService $pdfIndexService = null; private ?PdfIndexService $pdfIndexService = null;
private ?OfficeIndexService $officeIndexService = null;
public function onIndexPage(string $content, array &$data, array &$set): void public function onIndexPage(string $content, array &$data, array &$set): void
{ {
// ✅ IMMER: Service einmal pro Crawl holen + Tabelle einmal leeren // ✅ IMMER: Service einmal pro Crawl holen + Tabelle einmal leeren
if ($this->pdfIndexService === null) { if ($this->pdfIndexService === null) {
$this->pdfIndexService = System::getContainer()->get(PdfIndexService::class); $this->pdfIndexService = System::getContainer()->get(PdfIndexService::class);
$this->pdfIndexService->resetTableOnce(); // <- darf NICHT von Checkbox abhängen! $this->pdfIndexService->resetTableOnce(); // darf NICHT von Checkboxen abhängen
}
// ✅ Checkbox steuert nur die PDF-Suche/Indexierung (nicht den Reset!)
$pdfEnabled = (bool) (Config::get('meilisearchIndexPdfs') ?? Config::get('meilisearch_index_pdfs'));
if (!$pdfEnabled) {
return;
}
// Marker vorhanden?
if (!str_contains($content, 'MEILISEARCH_JSON')) {
return;
}
$parsed = $this->extractMeilisearchJson($content);
if ($parsed === null) {
return;
} }
/* /*
* ===================== * =====================
* PRIORITY * SEITEN-METADATEN (IMMER)
* ===================== * =====================
*/ */
if (str_contains($content, 'MEILISEARCH_JSON')) {
$parsed = $this->extractMeilisearchJson($content);
if (is_array($parsed)) {
/*
* PRIORITY
*/
$priority = $priority =
$parsed['event']['priority'] ?? null ?? $parsed['event']['priority'] ?? null ??
$parsed['news']['priority'] ?? null ?? $parsed['news']['priority'] ?? null ??
@@ -49,9 +43,7 @@ class IndexPageListener
} }
/* /*
* =====================
* KEYWORDS * KEYWORDS
* =====================
*/ */
$keywordSources = [ $keywordSources = [
$parsed['event']['keywords'] ?? null, $parsed['event']['keywords'] ?? null,
@@ -78,9 +70,7 @@ class IndexPageListener
} }
/* /*
* =====================
* IMAGEPATH * IMAGEPATH
* =====================
*/ */
$image = $image =
$parsed['event']['searchimage'] ?? null ?? $parsed['event']['searchimage'] ?? null ??
@@ -93,9 +83,7 @@ class IndexPageListener
} }
/* /*
* =====================
* STARTDATE * STARTDATE
* =====================
*/ */
$date = $date =
$parsed['event']['date'] ?? null ?? $parsed['event']['date'] ?? null ??
@@ -107,20 +95,44 @@ class IndexPageListener
$set['startDate'] = $ts; $set['startDate'] = $ts;
} }
} }
}
}
/* /*
* ===================== * =====================
* PDF-ERKENNUNG * PDF-INDEXIERUNG (OPTIONAL)
* ===================== * =====================
*/ */
$pdfEnabled = (bool) Config::get('meilisearch_index_pdfs');
if ($pdfEnabled && (int) ($data['protected'] ?? 0) === 0) {
$pdfLinks = $this->findPdfLinks($content); $pdfLinks = $this->findPdfLinks($content);
// PDFs NUR auf öffentlichen Seiten indexieren if ($pdfLinks !== []) {
if ($pdfLinks !== [] && (int) ($data['protected'] ?? 0) === 0) {
$this->pdfIndexService->handlePdfLinks($pdfLinks); $this->pdfIndexService->handlePdfLinks($pdfLinks);
} }
} }
/*
* =====================
* OFFICE-INDEXIERUNG (OPTIONAL)
* =====================
*/
$officeEnabled = (bool) Config::get('meilisearch_index_office');
if ($officeEnabled && (int) ($data['protected'] ?? 0) === 0) {
if ($this->officeIndexService === null) {
$this->officeIndexService = System::getContainer()->get(OfficeIndexService::class);
}
$officeLinks = $this->findOfficeLinks($content);
if ($officeLinks !== []) {
$this->officeIndexService->handleOfficeLinks($officeLinks);
}
}
}
private function extractMeilisearchJson(string $content): ?array private function extractMeilisearchJson(string $content): ?array
{ {
if (!preg_match('/<!--\s*MEILISEARCH_JSON\s*(\{.*?\})\s*-->/s', $content, $m)) { if (!preg_match('/<!--\s*MEILISEARCH_JSON\s*(\{.*?\})\s*-->/s', $content, $m)) {
@@ -154,4 +166,26 @@ class IndexPageListener
return $result; return $result;
} }
private function findOfficeLinks(string $content): array
{
if (!preg_match_all(
'/<a\s+[^>]*href=["\']([^"\']*(?:\.(?:docx|xlsx|pptx)|p=(?:docx|xlsx|pptx)(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is',
$content,
$matches
)) {
return [];
}
$result = [];
foreach ($matches[1] as $i => $href) {
$result[] = [
'url' => html_entity_decode($href),
'linkText' => trim(strip_tags($matches[2][$i])) ?: null,
];
}
return $result;
}
} }
+243
View File
@@ -0,0 +1,243 @@
<?php
namespace MummertMedia\ContaoMeilisearchBundle\Service;
use Contao\Database;
use PhpOffice\PhpWord\IOFactory as WordIOFactory;
use PhpOffice\PhpSpreadsheet\IOFactory as SpreadsheetIOFactory;
use PhpOffice\PhpPresentation\IOFactory as PresentationIOFactory;
use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
/**
 * Extracts plain text from Office documents (docx, xlsx, pptx) that are linked
 * on crawled pages and upserts the result into the tl_search_pdf table.
 *
 * Failures for a single document are logged via error_log() and never abort
 * the processing of the remaining links.
 */
class OfficeIndexService
{
    /** File extensions (lowercase) this service knows how to parse. */
    private const SUPPORTED_TYPES = ['docx', 'xlsx', 'pptx'];

    /** Maximum number of characters of cleaned text kept per document. */
    private const MAX_TEXT_LENGTH = 20000;

    private string $projectDir;

    /**
     * Relative paths already handled by this service instance, used to avoid
     * parsing the same file more than once (intended scope: one crawl run).
     *
     * @var array<string,true>
     */
    private array $seenThisCrawl = [];

    public function __construct(ParameterBagInterface $params)
    {
        $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
    }

    /**
     * Parses every linked Office file and writes its text to the index table.
     *
     * Links are skipped (with a log line) when they do not resolve to a
     * supported file below /files/, when the file does not exist, when the
     * file was already handled by this instance, or when no text could be
     * extracted.
     *
     * @param array<int,array{url:string,linkText:?string}> $officeLinks
     */
    public function handleOfficeLinks(array $officeLinks): void
    {
        foreach ($officeLinks as $row) {
            $url = (string) ($row['url'] ?? '');
            $linkText = $row['linkText'] ?? null;

            if ($url === '') {
                continue;
            }

            try {
                error_log('bearbeite Office-Datei: ' . $url);

                $normalized = $this->normalizeOfficeUrl($url);
                if ($normalized === null) {
                    error_log('→ übersprungen: kein gültiger Office-Pfad');
                    continue;
                }

                [$relativePath, $type] = $normalized;

                // Deduplicate on the resolved path rather than the raw URL so that
                // absolute, relative and ?p= links to the same file are parsed once.
                if (isset($this->seenThisCrawl[$relativePath])) {
                    error_log('→ übersprungen: bereits im Crawl verarbeitet');
                    continue;
                }
                $this->seenThisCrawl[$relativePath] = true;

                $absolutePath = $this->getAbsolutePath($relativePath);
                if (!is_file($absolutePath)) {
                    error_log('→ übersprungen: Datei existiert nicht: ' . $absolutePath);
                    continue;
                }

                $mtime = (int) (filemtime($absolutePath) ?: 0);
                $checksum = md5($relativePath . '|' . $mtime);

                // Fall back to the file name when the link carried no usable text.
                $title = (is_string($linkText) && $linkText !== '')
                    ? $linkText
                    : basename($absolutePath);

                $text = $this->parseOfficeFile($absolutePath, $type);
                if ($text === '') {
                    error_log('→ übersprungen: Office-Datei ohne Textinhalt');
                    continue;
                }

                $this->upsertOffice(
                    $relativePath,
                    $title,
                    $text,
                    $checksum,
                    $mtime,
                    $type
                );

                error_log('geschrieben in tl_search_pdf');
            } catch (\Throwable $e) {
                // Best effort: one broken document must not stop the others.
                error_log('Office Service FEHLER: ' . $e->getMessage());
            }
        }
    }

    /**
     * Resolves a link URL to a project-relative file path and its type.
     *
     * Two link shapes are recognised: a URL whose path starts with /files/,
     * and a URL carrying the file in a "p" query parameter, which is taken
     * relative to /files/.
     *
     * @return array{string,string}|null [relativePath, type], or null when the
     *                                   URL does not point to a supported file
     */
    private function normalizeOfficeUrl(string $url): ?array
    {
        $parts = parse_url(html_entity_decode($url));
        if (!is_array($parts)) {
            return null;
        }

        // Direct /files/ path. parse_url() leaves percent-encoding in place, so
        // decode it here; otherwise names with spaces or umlauts never match a
        // file on disk.
        $path = rawurldecode((string) ($parts['path'] ?? ''));
        if (str_starts_with($path, '/files/')) {
            $target = $this->toOfficeTarget($path);
            if ($target !== null) {
                return $target;
            }
        }

        // Download link carrying the file in ?p=
        if (!empty($parts['query'])) {
            parse_str($parts['query'], $query);

            // A "p[]=" query yields an array; only a plain string is usable.
            if (isset($query['p']) && is_string($query['p']) && $query['p'] !== '') {
                $p = rawurldecode($query['p']);

                return $this->toOfficeTarget('/files/' . ltrim($p, '/'));
            }
        }

        return null;
    }

    /**
     * Validates a candidate relative path and pairs it with its file type.
     *
     * @return array{string,string}|null null for unsupported extensions and for
     *                                   paths containing ".." segments or NUL bytes
     */
    private function toOfficeTarget(string $relativePath): ?array
    {
        $type = strtolower(pathinfo($relativePath, PATHINFO_EXTENSION));
        if (!in_array($type, self::SUPPORTED_TYPES, true)) {
            return null;
        }

        if (str_contains($relativePath, "\0")) {
            return null;
        }

        // Reject parent-directory segments so a link cannot escape /files/.
        $segments = explode('/', str_replace('\\', '/', $relativePath));
        if (in_array('..', $segments, true)) {
            return null;
        }

        return [$relativePath, $type];
    }

    /**
     * Joins the project directory with a project-relative path.
     */
    private function getAbsolutePath(string $relativePath): string
    {
        return $this->projectDir . '/' . ltrim($relativePath, '/');
    }

    /**
     * Inserts the document into tl_search_pdf, or updates the existing row when
     * the insert collides with a unique key.
     *
     * @param string $url      project-relative file path stored in the url column
     * @param string $title    link text or file name
     * @param string $text     cleaned document text
     * @param string $checksum md5 of "<relativePath>|<mtime>" as built by the caller
     * @param int    $mtime    file modification time (0 when unknown)
     * @param string $type     file type: docx, xlsx or pptx
     */
    private function upsertOffice(
        string $url,
        string $title,
        string $text,
        string $checksum,
        int $mtime,
        string $type
    ): void {
        $db = Database::getInstance();

        $db->prepare('
INSERT INTO tl_search_pdf
(tstamp, type, url, title, text, checksum, file_mtime)
VALUES
(?, ?, ?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE
tstamp=VALUES(tstamp),
type=VALUES(type),
url=VALUES(url),
title=VALUES(title),
text=VALUES(text),
file_mtime=VALUES(file_mtime)
')->execute(
            time(),
            $type,
            $url,
            $title,
            $text,
            $checksum,
            $mtime
        );
    }

    /**
     * Dispatches to the parser for the given type.
     *
     * Parser errors are logged and reported as an empty string so the caller
     * can skip the file; unknown types also yield an empty string.
     */
    private function parseOfficeFile(string $absolutePath, string $type): string
    {
        try {
            return match ($type) {
                'docx' => $this->parseDocx($absolutePath),
                'xlsx' => $this->parseXlsx($absolutePath),
                'pptx' => $this->parsePptx($absolutePath),
                default => '',
            };
        } catch (\Throwable $e) {
            // Log the real cause instead of silently reporting "no text".
            error_log('Office Service FEHLER beim Parsen (' . $type . '): ' . $e->getMessage());

            return '';
        }
    }

    /**
     * Extracts the text of all sections of a Word document.
     */
    private function parseDocx(string $absolutePath): string
    {
        $phpWord = WordIOFactory::load($absolutePath);

        $chunks = [];
        foreach ($phpWord->getSections() as $section) {
            $chunks[] = $this->collectWordText($section);
        }

        return $this->cleanText(implode(' ', $chunks));
    }

    /**
     * Recursively collects text from a document element.
     *
     * Works by duck typing: a getText() result is used when it is a string,
     * flattened when it is an array, and descended into when it is an object.
     * Elements without usable getText() are searched through getElements()
     * and through getRows()/getCells().
     */
    private function collectWordText(object $element): string
    {
        if (is_callable([$element, 'getText'])) {
            $value = $element->getText();

            if (is_string($value)) {
                return $value;
            }

            if (is_array($value)) {
                return implode(' ', array_filter($value, 'is_string'));
            }

            if (is_object($value)) {
                return $this->collectWordText($value);
            }
        }

        $parts = [];

        if (is_callable([$element, 'getElements'])) {
            foreach ($element->getElements() as $child) {
                if (is_object($child)) {
                    $parts[] = $this->collectWordText($child);
                }
            }
        }

        if (is_callable([$element, 'getRows'])) {
            foreach ($element->getRows() as $row) {
                if (!is_object($row) || !is_callable([$row, 'getCells'])) {
                    continue;
                }

                foreach ($row->getCells() as $cell) {
                    if (is_object($cell)) {
                        $parts[] = $this->collectWordText($cell);
                    }
                }
            }
        }

        return implode(' ', array_filter($parts, static fn (string $part): bool => $part !== ''));
    }

    /**
     * Extracts the scalar cell values of all sheets of a spreadsheet.
     */
    private function parseXlsx(string $absolutePath): string
    {
        $spreadsheet = SpreadsheetIOFactory::load($absolutePath);

        $text = '';
        foreach ($spreadsheet->getAllSheets() as $sheet) {
            foreach ($sheet->toArray() as $row) {
                $text .= ' ' . implode(' ', array_filter($row, 'is_scalar'));
            }
        }

        return $this->cleanText($text);
    }

    /**
     * Extracts the plain text of every shape that offers getPlainText().
     */
    private function parsePptx(string $absolutePath): string
    {
        $presentation = PresentationIOFactory::load($absolutePath);

        $text = '';
        foreach ($presentation->getAllSlides() as $slide) {
            foreach ($slide->getShapeCollection() as $shape) {
                if (method_exists($shape, 'getPlainText')) {
                    $text .= ' ' . $shape->getPlainText();
                }
            }
        }

        return $this->cleanText($text);
    }

    /**
     * Normalises extracted text for indexing.
     *
     * Keeps letters, numbers, punctuation and separators, collapses all
     * whitespace to single spaces and truncates to MAX_TEXT_LENGTH characters.
     */
    private function cleanText(string $text): string
    {
        // Replace invalid byte sequences up front; otherwise the normaliser and
        // the /u patterns below fail and the whole text would be lost.
        $text = mb_scrub($text, 'UTF-8');

        if (class_exists(\Normalizer::class)) {
            // normalize() signals failure with false, which ?? would not catch.
            $normalized = \Normalizer::normalize($text, \Normalizer::FORM_C);
            if (is_string($normalized)) {
                $text = $normalized;
            }
        }

        $text = str_replace(["\r\n", "\r"], "\n", $text);

        // preg_replace() returns null on a PCRE error; keep the previous value then.
        $text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text) ?? $text;
        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

        return trim(mb_substr($text, 0, self::MAX_TEXT_LENGTH, 'UTF-8'));
    }
}