Bugfix
This commit is contained in:
@@ -101,18 +101,18 @@ class IndexPageListener
|
|||||||
*/
|
*/
|
||||||
$pdfLinks = $this->findPdfLinks($content);
|
$pdfLinks = $this->findPdfLinks($content);
|
||||||
|
|
||||||
if ($pdfLinks !== []) {
|
if ($pdfLinks === []) {
|
||||||
error_log('PDF gefunden');
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// PdfIndexService lazy aus dem Container holen
|
// Service lazy aus Container holen
|
||||||
if ($this->pdfIndexService === null) {
|
if ($this->pdfIndexService === null) {
|
||||||
$this->pdfIndexService = System::getContainer()->get(PdfIndexService::class);
|
$this->pdfIndexService = System::getContainer()->get(PdfIndexService::class);
|
||||||
|
$this->pdfIndexService->resetTableOnce();
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->pdfIndexService->startCrawl();
|
|
||||||
$this->pdfIndexService->handlePdfLinks($pdfLinks);
|
$this->pdfIndexService->handlePdfLinks($pdfLinks);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/* =====================================================
|
/* =====================================================
|
||||||
* JSON aus Marker extrahieren
|
* JSON aus Marker extrahieren
|
||||||
|
|||||||
@@ -1,18 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
namespace MummertMedia\ContaoMeilisearchBundle\EventListener;
|
|
||||||
|
|
||||||
use MummertMedia\ContaoMeilisearchBundle\Service\PdfIndexService;
|
|
||||||
|
|
||||||
/**
 * Event listener that starts the PDF index cleanup once the crawler
 * reports its last chunk.
 *
 * In the services configuration shown in this diff it is tagged as a
 * kernel.event_listener for the event "terminal42.escargot.last_chunk"
 * with onLastChunk() as the handler method.
 */
class PdfCleanupListener
|
|
||||||
{
|
|
||||||
// The PdfIndexService is injected and stored via constructor promotion.
public function __construct(
|
|
||||||
private PdfIndexService $pdfIndexService
|
|
||||||
) {}
|
|
||||||
|
|
||||||
/**
 * Writes a log line and delegates to PdfIndexService::cleanupRemovedPdfs().
 */
public function onLastChunk(): void
|
|
||||||
{
|
|
||||||
error_log('Crawler beendet → PDF Cleanup startet');
|
|
||||||
$this->pdfIndexService->cleanupRemovedPdfs();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -10,7 +10,3 @@ services:
|
|||||||
MummertMedia\ContaoMeilisearchBundle\EventListener\IndexPageListener:
|
MummertMedia\ContaoMeilisearchBundle\EventListener\IndexPageListener:
|
||||||
tags:
|
tags:
|
||||||
- { name: contao.hook, hook: indexPage, method: onIndexPage }
|
- { name: contao.hook, hook: indexPage, method: onIndexPage }
|
||||||
|
|
||||||
MummertMedia\ContaoMeilisearchBundle\EventListener\PdfCleanupListener:
|
|
||||||
tags:
|
|
||||||
- { name: kernel.event_listener, event: terminal42.escargot.last_chunk, method: onLastChunk }
|
|
||||||
@@ -8,63 +8,43 @@ use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
|
|||||||
|
|
||||||
class PdfIndexService
|
class PdfIndexService
|
||||||
{
|
{
|
||||||
|
|
||||||
private int $crawlStart = 0;
|
|
||||||
|
|
||||||
/**
 * Records the start time of the crawl on this service instance.
 *
 * Only the first call (while $crawlStart is still 0) stores the current
 * Unix timestamp and logs it; every later call leaves the value untouched.
 */
public function startCrawl(): void
|
|
||||||
{
|
|
||||||
// 0 is the property's initial value and means "no start time recorded yet".
if ($this->crawlStart === 0) {
|
|
||||||
$this->crawlStart = time();
|
|
||||||
error_log('PDF Crawl Start: ' . $this->crawlStart);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Deletes every row of tl_search_pdf whose tstamp is lower than the
 * recorded crawl start time.
 *
 * Returns without touching the database when no start time has been
 * recorded on this instance ($crawlStart is still 0).
 *
 * NOTE(review): $crawlStart lives only in this object's memory; if the last
 * chunk is handled by a different instance/request than the one that called
 * startCrawl(), this method is a no-op — confirm the service lifetime.
 * NOTE(review): handlePdfLinks() skips already indexed PDFs without a visible
 * tstamp refresh, so unchanged entries may also fall below the cutoff — confirm.
 */
public function cleanupRemovedPdfs(): void
|
|
||||||
{
|
|
||||||
// Without a recorded start time there is no cutoff to compare against.
if ($this->crawlStart === 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// The cutoff is passed as a bound parameter of the prepared statement.
Database::getInstance()
|
|
||||||
->prepare('DELETE FROM tl_search_pdf WHERE tstamp < ?')
|
|
||||||
->execute($this->crawlStart);
|
|
||||||
|
|
||||||
error_log('PDF Cleanup abgeschlossen');
|
|
||||||
}
|
|
||||||
private string $projectDir;
|
private string $projectDir;
|
||||||
|
private bool $tableReset = false;
|
||||||
|
|
||||||
public function __construct(ParameterBagInterface $params)
|
public function __construct(ParameterBagInterface $params)
|
||||||
{
|
{
|
||||||
// Contao 5 / Symfony-konform
|
|
||||||
$this->projectDir = rtrim($params->get('kernel.project_dir'), '/');
|
$this->projectDir = rtrim($params->get('kernel.project_dir'), '/');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/* =====================================================
|
||||||
* Einstiegspunkt aus dem IndexPageListener
|
* Reset tl_search_pdf einmal pro Crawl
|
||||||
*/
|
* ===================================================== */
|
||||||
|
// Empties the tl_search_pdf table, at most once per instance of this service.
//
// The $tableReset flag is kept in memory only: the first call truncates the
// table and sets the flag, every later call on the same instance returns
// immediately.
// NOTE(review): whether "once per instance" equals "once per crawl" depends
// on how long the service instance lives — confirm.
public function resetTableOnce(): void
|
||||||
|
{
|
||||||
|
// Already truncated by this instance: nothing to do.
if ($this->tableReset) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Removes all rows; the flag is only set after the statement has run.
Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
|
||||||
|
$this->tableReset = true;
|
||||||
|
|
||||||
|
error_log('PDF Reset: tl_search_pdf geleert');
|
||||||
|
}
|
||||||
|
|
||||||
|
/* =====================================================
|
||||||
|
* Einstiegspunkt aus Listener
|
||||||
|
* ===================================================== */
|
||||||
public function handlePdfLinks(array $pdfLinks): void
|
public function handlePdfLinks(array $pdfLinks): void
|
||||||
{
|
{
|
||||||
error_log('PDF Service aufgerufen');
|
|
||||||
error_log('PDF Links Count: ' . count($pdfLinks));
|
|
||||||
error_log('PDF Links: ' . json_encode($pdfLinks, JSON_UNESCAPED_SLASHES));
|
|
||||||
|
|
||||||
foreach ($pdfLinks as $url) {
|
foreach ($pdfLinks as $url) {
|
||||||
try {
|
try {
|
||||||
error_log('bearbeite PDF: ' . $url);
|
|
||||||
|
|
||||||
$normalizedPath = $this->normalizePdfUrl($url);
|
$normalizedPath = $this->normalizePdfUrl($url);
|
||||||
error_log('umgewandelte URL: ' . var_export($normalizedPath, true));
|
|
||||||
|
|
||||||
if ($normalizedPath === null) {
|
if ($normalizedPath === null) {
|
||||||
error_log('→ übersprungen: kein gültiger PDF-Pfad');
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$absolutePath = $this->getAbsolutePath($normalizedPath);
|
$absolutePath = $this->getAbsolutePath($normalizedPath);
|
||||||
error_log('absoluter Pfad: ' . var_export($absolutePath, true));
|
|
||||||
|
|
||||||
if (!is_file($absolutePath)) {
|
if (!is_file($absolutePath)) {
|
||||||
error_log('→ übersprungen: Datei existiert nicht');
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -72,32 +52,24 @@ class PdfIndexService
|
|||||||
$checksum = md5($normalizedPath . $mtime);
|
$checksum = md5($normalizedPath . $mtime);
|
||||||
|
|
||||||
if ($this->alreadyIndexed($checksum)) {
|
if ($this->alreadyIndexed($checksum)) {
|
||||||
error_log('→ übersprungen: bereits indexiert');
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$title = basename($absolutePath);
|
|
||||||
error_log('gefundener Title: ' . $title);
|
|
||||||
|
|
||||||
$text = $this->parsePdf($absolutePath);
|
$text = $this->parsePdf($absolutePath);
|
||||||
if ($text === '') {
|
if ($text === '') {
|
||||||
error_log('→ übersprungen: PDF ohne Textinhalt');
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->insertPdf(
|
$this->insertPdf(
|
||||||
$normalizedPath,
|
$normalizedPath,
|
||||||
$title,
|
basename($absolutePath),
|
||||||
$text,
|
$text,
|
||||||
$checksum,
|
$checksum,
|
||||||
$mtime
|
$mtime
|
||||||
);
|
);
|
||||||
|
|
||||||
error_log('geschrieben in tl_search_pdf');
|
|
||||||
|
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable $e) {
|
||||||
error_log('PDF Service FEHLER (pro PDF): ' . $e->getMessage());
|
error_log('PDF Service Fehler: ' . $e->getMessage());
|
||||||
error_log($e->getTraceAsString());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -107,15 +79,13 @@ class PdfIndexService
|
|||||||
* ===================================================== */
|
* ===================================================== */
|
||||||
private function normalizePdfUrl(string $url): ?string
|
private function normalizePdfUrl(string $url): ?string
|
||||||
{
|
{
|
||||||
// Fall 1: direkter /files/-Pfad
|
|
||||||
if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) {
|
if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) {
|
||||||
return $url;
|
return $url;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fall 2: Contao-Download-Link mit ?p=
|
|
||||||
$decoded = html_entity_decode($url);
|
$decoded = html_entity_decode($url);
|
||||||
|
|
||||||
$parts = parse_url($decoded);
|
$parts = parse_url($decoded);
|
||||||
|
|
||||||
if (!isset($parts['query'])) {
|
if (!isset($parts['query'])) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@@ -123,7 +93,6 @@ class PdfIndexService
|
|||||||
parse_str($parts['query'], $query);
|
parse_str($parts['query'], $query);
|
||||||
|
|
||||||
if (!empty($query['p'])) {
|
if (!empty($query['p'])) {
|
||||||
// Contao speichert Pfade relativ zu /files
|
|
||||||
return '/files/' . ltrim($query['p'], '/');
|
return '/files/' . ltrim($query['p'], '/');
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -131,7 +100,7 @@ class PdfIndexService
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* =====================================================
|
/* =====================================================
|
||||||
* relativer Pfad → absoluter Pfad
|
* Pfade
|
||||||
* ===================================================== */
|
* ===================================================== */
|
||||||
private function getAbsolutePath(string $relativePath): string
|
private function getAbsolutePath(string $relativePath): string
|
||||||
{
|
{
|
||||||
@@ -139,13 +108,11 @@ class PdfIndexService
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* =====================================================
|
/* =====================================================
|
||||||
* DB-Helfer
|
* DB
|
||||||
* ===================================================== */
|
* ===================================================== */
|
||||||
private function alreadyIndexed(string $checksum): bool
|
private function alreadyIndexed(string $checksum): bool
|
||||||
{
|
{
|
||||||
$db = Database::getInstance();
|
$result = Database::getInstance()
|
||||||
|
|
||||||
$result = $db
|
|
||||||
->prepare('SELECT id FROM tl_search_pdf WHERE checksum = ?')
|
->prepare('SELECT id FROM tl_search_pdf WHERE checksum = ?')
|
||||||
->execute($checksum);
|
->execute($checksum);
|
||||||
|
|
||||||
@@ -159,9 +126,7 @@ class PdfIndexService
|
|||||||
string $checksum,
|
string $checksum,
|
||||||
int $mtime
|
int $mtime
|
||||||
): void {
|
): void {
|
||||||
$db = Database::getInstance();
|
Database::getInstance()
|
||||||
|
|
||||||
$db
|
|
||||||
->prepare('
|
->prepare('
|
||||||
INSERT INTO tl_search_pdf
|
INSERT INTO tl_search_pdf
|
||||||
(tstamp, url, title, text, checksum, file_mtime)
|
(tstamp, url, title, text, checksum, file_mtime)
|
||||||
@@ -186,43 +151,18 @@ class PdfIndexService
|
|||||||
$parser = new Parser();
|
$parser = new Parser();
|
||||||
$pdf = $parser->parseFile($absolutePath);
|
$pdf = $parser->parseFile($absolutePath);
|
||||||
|
|
||||||
$text = $this->cleanPdfContent($pdf->getText());
|
$text = $pdf->getText();
|
||||||
|
|
||||||
// bewusst begrenzen (Performance + Relevanz)
|
|
||||||
return mb_substr($text, 0, 5000);
|
|
||||||
|
|
||||||
} catch (\Throwable $e) {
|
|
||||||
error_log('PDF Parser FEHLER: ' . $e->getMessage());
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private function cleanPdfContent(string $text): string
|
|
||||||
{
|
|
||||||
// 1. Unicode normalisieren (wichtig!)
|
|
||||||
if (class_exists(\Normalizer::class)) {
|
if (class_exists(\Normalizer::class)) {
|
||||||
$text = \Normalizer::normalize($text, \Normalizer::FORM_C);
|
$text = \Normalizer::normalize($text, \Normalizer::FORM_C);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. Musik- & Spezialglyphen entfernen
|
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}]/u', ' ', $text);
|
||||||
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
|
|
||||||
|
|
||||||
// 3. Falsche Worttrennungen reparieren: "ges pielt" → "gespielt"
|
|
||||||
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
|
|
||||||
|
|
||||||
// 4. Spezielle PDF-Apostrophe reparieren
|
|
||||||
$text = str_replace(
|
|
||||||
["\\'", "’", "‘"],
|
|
||||||
"'",
|
|
||||||
$text
|
|
||||||
);
|
|
||||||
|
|
||||||
// 5. Mehrfache Satzzeichen bereinigen
|
|
||||||
$text = preg_replace('/([.,;:!?])\1+/', '$1', $text);
|
|
||||||
|
|
||||||
// 6. Überflüssige Leerzeichen & Zeilenumbrüche
|
|
||||||
$text = preg_replace('/\s+/u', ' ', $text);
|
$text = preg_replace('/\s+/u', ' ', $text);
|
||||||
|
|
||||||
return trim($text);
|
return trim(mb_substr($text, 0, 5000));
|
||||||
|
} catch (\Throwable) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user