Bugfix
This commit is contained in:
@@ -10,16 +10,11 @@ class PdfIndexService
|
|||||||
{
|
{
|
||||||
private string $projectDir;
|
private string $projectDir;
|
||||||
|
|
||||||
/**
|
/** @var bool */
|
||||||
* Merkt sich Checksums innerhalb eines Crawls
|
private bool $crawlInitialized = false;
|
||||||
* → verhindert Duplicate INSERTs
|
|
||||||
*/
|
|
||||||
private array $processedChecksums = [];
|
|
||||||
|
|
||||||
/**
|
/** @var array<string, bool> */
|
||||||
* Flag, damit das Reset nur 1× pro Crawl passiert
|
private array $processedChecksums = [];
|
||||||
*/
|
|
||||||
private bool $resetDone = false;
|
|
||||||
|
|
||||||
public function __construct(ParameterBagInterface $params)
|
public function __construct(ParameterBagInterface $params)
|
||||||
{
|
{
|
||||||
@@ -27,27 +22,19 @@ class PdfIndexService
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* =====================================================
|
/* =====================================================
|
||||||
* Crawl-Start: Tabelle leeren
|
* PUBLIC API
|
||||||
* ===================================================== */
|
* ===================================================== */
|
||||||
public function startCrawl(): void
|
|
||||||
{
|
|
||||||
if ($this->resetDone) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
|
/**
|
||||||
|
* Einstiegspunkt aus dem IndexPageListener
|
||||||
$this->processedChecksums = [];
|
*
|
||||||
$this->resetDone = true;
|
* @param array<int,array{url:string,text?:string|null}> $pdfLinks
|
||||||
|
*/
|
||||||
error_log('PDF Crawl Start → tl_search_pdf geleert');
|
|
||||||
}
|
|
||||||
|
|
||||||
/* =====================================================
|
|
||||||
* Einstiegspunkt aus dem Listener
|
|
||||||
* ===================================================== */
|
|
||||||
public function handlePdfLinks(array $pdfLinks): void
|
public function handlePdfLinks(array $pdfLinks): void
|
||||||
{
|
{
|
||||||
|
// 🔴 WICHTIG: Reset garantiert VOR dem ersten INSERT
|
||||||
|
$this->initializeCrawl();
|
||||||
|
|
||||||
foreach ($pdfLinks as $pdf) {
|
foreach ($pdfLinks as $pdf) {
|
||||||
try {
|
try {
|
||||||
$url = $pdf['url'];
|
$url = $pdf['url'];
|
||||||
@@ -61,30 +48,36 @@ class PdfIndexService
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$absolutePath = $this->getAbsolutePath($relativePath);
|
$absolutePath = $this->projectDir . '/' . ltrim($relativePath, '/');
|
||||||
if (!is_file($absolutePath)) {
|
if (!is_file($absolutePath)) {
|
||||||
error_log('→ übersprungen: Datei existiert nicht');
|
error_log('→ übersprungen: Datei existiert nicht');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Datei-Zeitstempel
|
||||||
$mtime = filemtime($absolutePath) ?: 0;
|
$mtime = filemtime($absolutePath) ?: 0;
|
||||||
$checksum = md5($relativePath . $mtime);
|
|
||||||
|
|
||||||
|
// Stabiler Crawl-Checksum
|
||||||
|
$checksum = md5($relativePath . '|' . $mtime);
|
||||||
|
|
||||||
|
// Pro Crawl deduplizieren
|
||||||
if (isset($this->processedChecksums[$checksum])) {
|
if (isset($this->processedChecksums[$checksum])) {
|
||||||
error_log('→ übersprungen: bereits im Crawl verarbeitet');
|
error_log('→ übersprungen: bereits im Crawl verarbeitet');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->processedChecksums[$checksum] = true;
|
$this->processedChecksums[$checksum] = true;
|
||||||
|
|
||||||
$title = $this->resolveTitle($linkText, $absolutePath);
|
// Titel bestimmen
|
||||||
$text = $this->parsePdf($absolutePath);
|
$title = $this->resolveTitle($absolutePath, $linkText);
|
||||||
|
|
||||||
|
// PDF parsen
|
||||||
|
$text = $this->parsePdf($absolutePath);
|
||||||
if ($text === '') {
|
if ($text === '') {
|
||||||
error_log('→ übersprungen: PDF ohne Textinhalt');
|
error_log('→ übersprungen: PDF ohne Textinhalt');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Schreiben
|
||||||
$this->insertPdf(
|
$this->insertPdf(
|
||||||
$relativePath,
|
$relativePath,
|
||||||
$title,
|
$title,
|
||||||
@@ -97,45 +90,41 @@ class PdfIndexService
|
|||||||
|
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable $e) {
|
||||||
error_log('PDF Service FEHLER: ' . $e->getMessage());
|
error_log('PDF Service FEHLER: ' . $e->getMessage());
|
||||||
|
error_log($e->getTraceAsString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* =====================================================
|
/* =====================================================
|
||||||
* Titel-Ermittlung (Prio!)
|
* CRAWL-LIFECYCLE
|
||||||
* ===================================================== */
|
* ===================================================== */
|
||||||
private function resolveTitle(?string $linkText, string $absolutePath): string
|
|
||||||
|
/**
 * Prepares the service for a crawl; the work is done at most once per instance.
 *
 * Empties tl_search_pdf and clears the in-memory checksum registry, then
 * marks the crawl as initialized so that later calls return immediately.
 *
 * The flag is raised only after the TRUNCATE went through: if the statement
 * throws, the next call retries the reset instead of inserting into a table
 * that was never cleared.
 *
 * NOTE(review): the guard is a plain instance property, so "once per crawl"
 * holds only if a single service instance lives for the whole crawl - confirm
 * against how the listener obtains this service.
 */
private function initializeCrawl(): void
{
    if ($this->crawlInitialized) {
        return;
    }

    Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');

    // State is touched only once the table is known to be empty.
    $this->processedChecksums = [];
    $this->crawlInitialized = true;

    error_log('PDF Crawl initialisiert → tl_search_pdf geleert');
}
|
||||||
|
|
||||||
/* =====================================================
|
/* =====================================================
|
||||||
* URL → relativer /files-Pfad
|
* URL-NORMALISIERUNG
|
||||||
* ===================================================== */
|
* ===================================================== */
|
||||||
|
|
||||||
private function normalizePdfUrl(string $url): ?string
|
private function normalizePdfUrl(string $url): ?string
|
||||||
{
|
{
|
||||||
// direkter /files-Link
|
// Direkter /files-Link
|
||||||
if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) {
|
if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) {
|
||||||
return $url;
|
return $url;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Contao Download-Link (?p=pdf/...)
|
// Contao Hash-/Download-Link (?p=)
|
||||||
$decoded = html_entity_decode($url);
|
$decoded = html_entity_decode($url);
|
||||||
$parts = parse_url($decoded);
|
$parts = parse_url($decoded);
|
||||||
|
|
||||||
@@ -153,32 +142,53 @@ class PdfIndexService
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* =====================================================
|
/* =====================================================
|
||||||
* relativer → absoluter Pfad
|
* TITEL-AUFLÖSUNG
|
||||||
* ===================================================== */
|
* ===================================================== */
|
||||||
private function getAbsolutePath(string $relativePath): string
|
|
||||||
|
/**
 * Picks the title stored for a PDF, trying three sources in order:
 * the link text, the "Title" entry of the PDF details, the file name.
 *
 * @param string      $absolutePath Path handed to the PDF parser; its basename is the last-resort title.
 * @param string|null $linkText     Text of the link that pointed at the PDF, if any.
 *
 * @return string The first candidate that is non-empty after trimming.
 */
private function resolveTitle(string $absolutePath, ?string $linkText): string
{
    // 1. Link text. Markup is stripped before the emptiness check, so a link
    //    made up only of tags falls through to the next source instead of
    //    ending up as the title.
    $fromLink = trim(strip_tags($linkText ?? ''));

    if ($fromLink !== '') {
        return $fromLink;
    }

    // 2. PDF metadata.
    try {
        $details = (new Parser())->parseFile($absolutePath)->getDetails();
        $metaTitle = $details['Title'] ?? null;

        // Only scalar values can be turned into a title; anything else
        // (e.g. an array) would otherwise be cast to the string "Array".
        if (is_scalar($metaTitle)) {
            $metaTitle = trim((string) $metaTitle);

            if ($metaTitle !== '') {
                return $metaTitle;
            }
        }
    } catch (\Throwable) {
        // Deliberately best effort: an unreadable PDF simply has no metadata
        // title, and the file name below is used instead.
    }

    // 3. Fallback: file name.
    return basename($absolutePath);
}
|
||||||
|
|
||||||
/* =====================================================
|
/* =====================================================
|
||||||
* DB INSERT
|
* DB
|
||||||
* ===================================================== */
|
* ===================================================== */
|
||||||
|
|
||||||
private function insertPdf(
|
private function insertPdf(
|
||||||
string $path,
|
string $url,
|
||||||
string $title,
|
string $title,
|
||||||
string $text,
|
string $text,
|
||||||
string $checksum,
|
string $checksum,
|
||||||
int $mtime
|
int $mtime
|
||||||
): void {
|
): void {
|
||||||
Database::getInstance()
|
Database::getInstance()
|
||||||
->prepare('
|
->prepare(
|
||||||
INSERT INTO tl_search_pdf
|
'INSERT INTO tl_search_pdf
|
||||||
(tstamp, url, title, text, checksum, file_mtime)
|
(tstamp, url, title, text, checksum, file_mtime)
|
||||||
VALUES (?, ?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?, ?)'
|
||||||
')
|
)
|
||||||
->execute(
|
->execute(
|
||||||
time(),
|
time(),
|
||||||
$path,
|
$url,
|
||||||
$title,
|
$title,
|
||||||
$text,
|
$text,
|
||||||
$checksum,
|
$checksum,
|
||||||
@@ -187,16 +197,20 @@ class PdfIndexService
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* =====================================================
|
/* =====================================================
|
||||||
* PDF-Parsing + Cleanup
|
* PDF PARSING
|
||||||
* ===================================================== */
|
* ===================================================== */
|
||||||
|
|
||||||
private function parsePdf(string $absolutePath): string
|
private function parsePdf(string $absolutePath): string
|
||||||
{
|
{
|
||||||
try {
|
try {
|
||||||
$parser = new Parser();
|
$parser = new Parser();
|
||||||
$pdf = $parser->parseFile($absolutePath);
|
$pdf = $parser->parseFile($absolutePath);
|
||||||
|
|
||||||
$text = $this->cleanPdfContent($pdf->getText());
|
$text = $this->cleanPdfContent($pdf->getText());
|
||||||
|
|
||||||
|
// Begrenzen (Performance + Relevanz)
|
||||||
return mb_substr($text, 0, 5000);
|
return mb_substr($text, 0, 5000);
|
||||||
|
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable $e) {
|
||||||
error_log('PDF Parser FEHLER: ' . $e->getMessage());
|
error_log('PDF Parser FEHLER: ' . $e->getMessage());
|
||||||
return '';
|
return '';
|
||||||
@@ -209,10 +223,19 @@ class PdfIndexService
|
|||||||
$text = \Normalizer::normalize($text, \Normalizer::FORM_C);
|
$text = \Normalizer::normalize($text, \Normalizer::FORM_C);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sonderglyphen raus
|
||||||
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
|
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
|
||||||
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
|
|
||||||
$text = str_replace(["\\'", "’", "‘"], "'", $text);
|
// Repair hyphenation at line breaks ("Wort-\ntrennung" becomes "Worttrennung").
// Only a hyphen that is followed by a line break and a lowercase letter is
// joined. Whitespace between ordinary words has to survive: deleting every
// whitespace run between two letters would glue all words of the text together.
$text = preg_replace('/(?<=\p{L})-\h*\n\s*(?=\p{Ll})/u', '', $text);
|
||||||
|
|
||||||
|
// Apostrophe normalisieren
|
||||||
|
$text = str_replace(["\\'", '’', '‘'], "'", $text);
|
||||||
|
|
||||||
|
// Mehrfache Satzzeichen
|
||||||
$text = preg_replace('/([.,;:!?])\1+/', '$1', $text);
|
$text = preg_replace('/([.,;:!?])\1+/', '$1', $text);
|
||||||
|
|
||||||
|
// Whitespaces
|
||||||
$text = preg_replace('/\s+/u', ' ', $text);
|
$text = preg_replace('/\s+/u', ' ', $text);
|
||||||
|
|
||||||
return trim($text);
|
return trim($text);
|
||||||
|
|||||||
Reference in New Issue
Block a user