Bugfix
This commit is contained in:
@@ -132,13 +132,22 @@ class IndexPageListener
|
|||||||
private function findPdfLinks(string $content): array
|
private function findPdfLinks(string $content): array
|
||||||
{
|
{
|
||||||
if (!preg_match_all(
|
if (!preg_match_all(
|
||||||
'/<a\s+[^>]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\']/i',
|
'/<a\s+[^>]*href=["\']([^"\']*(?:\.pdf|p=pdf(?:%2F|\/)[^"\']*))["\'][^>]*>(.*?)<\/a>/is',
|
||||||
$content,
|
$content,
|
||||||
$matches
|
$matches
|
||||||
)) {
|
)) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
return array_unique(array_map('html_entity_decode', $matches[1]));
|
$result = [];
|
||||||
|
|
||||||
|
foreach ($matches[1] as $i => $href) {
|
||||||
|
$result[] = [
|
||||||
|
'url' => html_entity_decode($href),
|
||||||
|
'linkText' => trim(strip_tags($matches[2][$i])) ?: null,
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
return $result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
+115
-38
@@ -8,71 +8,84 @@ use Symfony\Component\DependencyInjection\ParameterBag\ParameterBagInterface;
|
|||||||
|
|
||||||
class PdfIndexService
|
class PdfIndexService
|
||||||
{
|
{
|
||||||
private bool $tableReset = false;
|
|
||||||
private string $projectDir;
|
private string $projectDir;
|
||||||
|
private bool $crawlStarted = false;
|
||||||
|
|
||||||
public function __construct(ParameterBagInterface $params)
|
public function __construct(ParameterBagInterface $params)
|
||||||
{
|
{
|
||||||
$this->projectDir = rtrim($params->get('kernel.project_dir'), '/');
|
$this->projectDir = rtrim($params->get('kernel.project_dir'), '/');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/* =====================================================
|
||||||
* 🔥 Wird bei JEDEM Crawl einmal aufgerufen
|
* Crawl-Start (immer aufrufen!)
|
||||||
*/
|
* ===================================================== */
|
||||||
public function resetTableOnce(): void
|
public function startCrawl(): void
|
||||||
{
|
{
|
||||||
if ($this->tableReset) {
|
if ($this->crawlStarted) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
|
$this->crawlStarted = true;
|
||||||
error_log('tl_search_pdf wurde geleert');
|
|
||||||
|
|
||||||
$this->tableReset = true;
|
// bewusst simpel: bei JEDEM Crawl komplett leeren
|
||||||
|
Database::getInstance()->execute('TRUNCATE TABLE tl_search_pdf');
|
||||||
|
|
||||||
|
error_log('PDF Crawl gestartet → tl_search_pdf geleert');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/* =====================================================
|
||||||
* Einstiegspunkt vom Listener
|
* Einstiegspunkt aus IndexPageListener
|
||||||
*/
|
* ===================================================== */
|
||||||
public function handlePdfLinks(array $pdfLinks): void
|
public function handlePdfLinks(array $pdfLinks): void
|
||||||
{
|
{
|
||||||
foreach ($pdfLinks as $url) {
|
foreach ($pdfLinks as $pdf) {
|
||||||
try {
|
try {
|
||||||
$path = $this->normalizePdfUrl($url);
|
$url = $pdf['url'];
|
||||||
if ($path === null) {
|
$linkText = $pdf['linkText'] ?? null;
|
||||||
|
|
||||||
|
error_log('bearbeite PDF: ' . $url);
|
||||||
|
|
||||||
|
$relativePath = $this->normalizePdfUrl($url);
|
||||||
|
if ($relativePath === null) {
|
||||||
|
error_log('→ übersprungen: kein gültiger PDF-Pfad');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$absolutePath = $this->projectDir . '/' . ltrim($path, '/');
|
$absolutePath = $this->projectDir . '/' . ltrim($relativePath, '/');
|
||||||
if (!is_file($absolutePath)) {
|
if (!is_file($absolutePath)) {
|
||||||
|
error_log('→ übersprungen: Datei existiert nicht');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$parser = new Parser();
|
$mtime = filemtime($absolutePath) ?: 0;
|
||||||
$pdf = $parser->parseFile($absolutePath);
|
$checksum = md5($relativePath . $mtime);
|
||||||
$text = $this->cleanPdfContent($pdf->getText());
|
|
||||||
|
// PDF parsen
|
||||||
|
[$text, $metaTitle] = $this->parsePdf($absolutePath);
|
||||||
|
|
||||||
if ($text === '') {
|
if ($text === '') {
|
||||||
|
error_log('→ übersprungen: kein Textinhalt');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
Database::getInstance()
|
// TITEL-PRIORITÄT
|
||||||
->prepare('
|
$title =
|
||||||
INSERT INTO tl_search_pdf
|
$linkText
|
||||||
(tstamp, url, title, text, checksum, file_mtime)
|
?: $metaTitle
|
||||||
VALUES (?, ?, ?, ?, ?, ?)
|
?: basename($absolutePath);
|
||||||
')
|
|
||||||
->execute(
|
$this->insertPdf(
|
||||||
time(),
|
$relativePath,
|
||||||
$path,
|
$title,
|
||||||
basename($absolutePath),
|
$text,
|
||||||
mb_substr($text, 0, 5000),
|
$checksum,
|
||||||
md5($path),
|
$mtime
|
||||||
filemtime($absolutePath) ?: 0
|
|
||||||
);
|
);
|
||||||
|
|
||||||
|
error_log('→ geschrieben in tl_search_pdf');
|
||||||
|
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable $e) {
|
||||||
error_log('PDF Fehler: ' . $e->getMessage());
|
error_log('PDF Service FEHLER: ' . $e->getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -82,18 +95,22 @@ class PdfIndexService
|
|||||||
* ===================================================== */
|
* ===================================================== */
|
||||||
private function normalizePdfUrl(string $url): ?string
|
private function normalizePdfUrl(string $url): ?string
|
||||||
{
|
{
|
||||||
if (str_starts_with($url, '/files/') && str_ends_with($url, '.pdf')) {
|
// direkter /files-Link
|
||||||
|
if (str_starts_with($url, '/files/') && str_ends_with(strtolower($url), '.pdf')) {
|
||||||
return $url;
|
return $url;
|
||||||
}
|
}
|
||||||
|
|
||||||
$parts = parse_url(html_entity_decode($url));
|
// Contao-Download-Link (?p=)
|
||||||
|
$decoded = html_entity_decode($url);
|
||||||
|
$parts = parse_url($decoded);
|
||||||
|
|
||||||
if (!isset($parts['query'])) {
|
if (!isset($parts['query'])) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
parse_str($parts['query'], $query);
|
parse_str($parts['query'], $query);
|
||||||
|
|
||||||
if (!empty($query['p'])) {
|
if (!empty($query['p']) && str_ends_with(strtolower($query['p']), '.pdf')) {
|
||||||
return '/files/' . ltrim($query['p'], '/');
|
return '/files/' . ltrim($query['p'], '/');
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -101,16 +118,76 @@ class PdfIndexService
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* =====================================================
|
/* =====================================================
|
||||||
* Textbereinigung
|
* DB
|
||||||
|
* ===================================================== */
|
||||||
|
private function insertPdf(
|
||||||
|
string $url,
|
||||||
|
string $title,
|
||||||
|
string $text,
|
||||||
|
string $checksum,
|
||||||
|
int $mtime
|
||||||
|
): void {
|
||||||
|
Database::getInstance()
|
||||||
|
->prepare('
|
||||||
|
INSERT INTO tl_search_pdf
|
||||||
|
(tstamp, url, title, text, checksum, file_mtime)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?)
|
||||||
|
')
|
||||||
|
->execute(
|
||||||
|
time(),
|
||||||
|
$url,
|
||||||
|
$title,
|
||||||
|
$text,
|
||||||
|
$checksum,
|
||||||
|
$mtime
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* =====================================================
|
||||||
|
* PDF Parsing
|
||||||
|
* ===================================================== */
|
||||||
|
private function parsePdf(string $absolutePath): array
|
||||||
|
{
|
||||||
|
try {
|
||||||
|
$parser = new Parser();
|
||||||
|
$pdf = $parser->parseFile($absolutePath);
|
||||||
|
|
||||||
|
$details = $pdf->getDetails();
|
||||||
|
$metaTitle = $details['Title'] ?? null;
|
||||||
|
|
||||||
|
$text = $this->cleanPdfContent($pdf->getText());
|
||||||
|
|
||||||
|
return [
|
||||||
|
mb_substr($text, 0, 5000),
|
||||||
|
is_string($metaTitle) && trim($metaTitle) !== '' ? trim($metaTitle) : null,
|
||||||
|
];
|
||||||
|
|
||||||
|
} catch (\Throwable $e) {
|
||||||
|
error_log('PDF Parser FEHLER: ' . $e->getMessage());
|
||||||
|
return ['', null];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* =====================================================
|
||||||
|
* Text-Bereinigung
|
||||||
* ===================================================== */
|
* ===================================================== */
|
||||||
private function cleanPdfContent(string $text): string
|
private function cleanPdfContent(string $text): string
|
||||||
{
|
{
|
||||||
|
// Unicode normalisieren
|
||||||
if (class_exists(\Normalizer::class)) {
|
if (class_exists(\Normalizer::class)) {
|
||||||
$text = \Normalizer::normalize($text, \Normalizer::FORM_C);
|
$text = \Normalizer::normalize($text, \Normalizer::FORM_C);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sonderglyphen entfernen (Noten, Steuerzeichen etc.)
|
||||||
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
|
$text = preg_replace('/[^\p{L}\p{N}\p{P}\p{Z}\n]/u', ' ', $text);
|
||||||
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', ' ', $text);
|
|
||||||
|
// falsche Worttrennungen ("ges pielt")
|
||||||
|
$text = preg_replace('/(?<=\p{L})\s+(?=\p{L})/u', '', $text);
|
||||||
|
|
||||||
|
// Apostrophe vereinheitlichen
|
||||||
|
$text = str_replace(["\\'", "’", "‘"], "'", $text);
|
||||||
|
|
||||||
|
// Mehrfach-Leerzeichen
|
||||||
$text = preg_replace('/\s+/u', ' ', $text);
|
$text = preg_replace('/\s+/u', ' ', $text);
|
||||||
|
|
||||||
return trim($text);
|
return trim($text);
|
||||||
|
|||||||
Reference in New Issue
Block a user