Files
meilisearch-bundle/src/Service/PdfIndexService.php
T
Jürgen Mummert e3c29d70e9 Bugfix
2025-12-25 14:41:20 +01:00

178 lines
5.3 KiB
PHP

<?php
namespace MummertMedia\ContaoMeilisearchBundle\Service;
use Contao\Database;
use Contao\StringUtil;
use Smalot\PdfParser\Parser;
/**
 * Indexes PDF files that are linked from pages into the tl_search_pdf table.
 *
 * For every link the service resolves a "/files/..." path, locates the file
 * below TL_ROOT, extracts its text with smalot/pdfparser and stores URL,
 * title (file name), text, a checksum and the file modification time.
 * A checksum built from the normalized URL and the mtime prevents indexing
 * the same file version twice.
 */
class PdfIndexService
{
    /** Maximum number of characters of extracted text stored per PDF. */
    private const MAX_TEXT_LENGTH = 5000;

    /**
     * Processes a list of PDF links and writes new/changed PDFs to the index.
     *
     * Errors are handled per link: a failure while processing one PDF is
     * logged and does not abort the remaining links.
     *
     * @param array $pdfLinks Link targets (absolute or relative URLs); entries
     *                        that are not strings are skipped.
     */
    public function handlePdfLinks(array $pdfLinks): void
    {
        error_log('PDF Service aufgerufen');
        error_log('PDF Links Count: ' . count($pdfLinks));
        error_log('PDF Links: ' . json_encode($pdfLinks, JSON_UNESCAPED_SLASHES));

        foreach ($pdfLinks as $url) {
            try {
                // Guard explicitly instead of relying on a TypeError further down.
                if (!is_string($url)) {
                    error_log('→ übersprungen: Link ist kein String');
                    continue;
                }

                error_log('bearbeite PDF: ' . $url);

                $normalizedUrl = $this->normalizePdfUrl($url);
                error_log('umgewandelte URL: ' . var_export($normalizedUrl, true));

                if ($normalizedUrl === null) {
                    error_log('→ übersprungen: normalizePdfUrl() == null');
                    continue;
                }

                $absolutePath = $this->getAbsolutePath($normalizedUrl);
                error_log('absoluter Pfad: ' . var_export($absolutePath, true));

                if ($absolutePath === null || !is_file($absolutePath)) {
                    error_log('→ übersprungen: Datei nicht gefunden');
                    continue;
                }

                // URL + mtime identify one concrete version of the file.
                $mtime = filemtime($absolutePath) ?: 0;
                $checksum = md5($normalizedUrl . $mtime);

                if ($this->alreadyIndexed($checksum)) {
                    error_log('→ übersprungen: bereits indexiert');
                    continue;
                }

                $title = basename($absolutePath);
                error_log('gefundener Title: ' . $title);

                $text = $this->parsePdf($absolutePath);

                if ($text === '') {
                    error_log('→ übersprungen: parsePdf() leer');
                    continue;
                }

                $this->insertPdf($normalizedUrl, $title, $text, $checksum, $mtime);
                error_log('geschrieben in tl_search_pdf');
            } catch (\Throwable $e) {
                error_log('PDF Service FEHLER (pro PDF): ' . $e->getMessage());
                error_log($e->getTraceAsString());
            }
        }
    }

    /* =====================================================
     * PDF parsing
     * ===================================================== */

    /**
     * Extracts the cleaned text of a PDF, truncated to MAX_TEXT_LENGTH characters.
     *
     * @return string The extracted text, or '' when the file cannot be parsed.
     */
    private function parsePdf(string $absolutePath): string
    {
        try {
            $parser = new Parser();
            $pdf = $parser->parseFile($absolutePath);
            $text = $this->cleanPdfContent($pdf->getText());

            return mb_substr($text, 0, self::MAX_TEXT_LENGTH);
        } catch (\Throwable $e) {
            error_log('→ Fehler beim Parsen der PDF: ' . $e->getMessage());

            return '';
        }
    }

    /**
     * Decodes entities, replaces control characters with spaces and collapses
     * all whitespace runs into a single space.
     */
    private function cleanPdfContent(string $content): string
    {
        $content = (string) StringUtil::decodeEntities($content);

        // preg_replace() with the /u modifier returns null for invalid UTF-8,
        // which would discard the whole text; repair the encoding first.
        if (!mb_check_encoding($content, 'UTF-8')) {
            $content = mb_scrub($content, 'UTF-8');
        }

        // Keep the previous value should PCRE still report an error.
        $content = preg_replace('/[\x00-\x1F\x7F]/u', ' ', $content) ?? $content;
        $content = preg_replace('/\s+/u', ' ', $content) ?? $content;

        return trim($content);
    }

    /* =====================================================
     * Database
     * ===================================================== */

    /**
     * Tells whether a row with the given checksum exists in tl_search_pdf.
     */
    private function alreadyIndexed(string $checksum): bool
    {
        $result = Database::getInstance()
            ->prepare('SELECT id FROM tl_search_pdf WHERE checksum=?')
            ->execute($checksum);

        return $result->numRows > 0;
    }

    /**
     * Inserts one PDF record into tl_search_pdf; tstamp is set to the current time.
     */
    private function insertPdf(
        string $url,
        string $title,
        string $text,
        string $checksum,
        int $mtime
    ): void {
        Database::getInstance()
            ->prepare('
INSERT INTO tl_search_pdf
(tstamp, url, title, text, checksum, file_mtime)
VALUES
(?, ?, ?, ?, ?, ?)
')
            ->execute(
                time(),
                $url,
                $title,
                $text,
                $checksum,
                $mtime
            );
    }

    /* =====================================================
     * URL & path helpers
     * ===================================================== */

    /**
     * Converts a link into a site-relative "/files/..." path.
     *
     * Supported forms:
     *  1. a URL whose path is "/files/....pdf",
     *  2a. a query parameter "p" ending in ".pdf" (relative to /files),
     *  2b. a query parameter "f" holding a file name that is searched for
     *      below TL_ROOT/files.
     *
     * The result is not yet checked for ".." segments; getAbsolutePath() does that.
     *
     * @return string|null The "/files/..." path or null if the link is not resolvable.
     */
    private function normalizePdfUrl(string $url): ?string
    {
        $url = html_entity_decode($url);

        // 1) direct /files/*.pdf link
        $path = parse_url($url, PHP_URL_PATH);

        if (is_string($path) && preg_match('~^/files/.*\.pdf$~i', $path)) {
            return $path;
        }

        // 2) inspect the query parameters
        $query = parse_url($url, PHP_URL_QUERY);

        if (!is_string($query) || $query === '') {
            return null;
        }

        parse_str($query, $params);

        // "p[]=..." / "f[]=..." produce arrays; only plain strings are usable.
        $p = $params['p'] ?? null;
        $f = $params['f'] ?? null;

        // 2a) Contao p=pdf/xyz.pdf
        if (is_string($p) && $p !== '' && preg_match('~\.pdf$~i', $p)) {
            return '/files/' . ltrim($p, '/');
        }

        // 2b) Contao download: f=file name -> look it up in the file system
        if (is_string($f) && $f !== '') {
            return $this->findBelowFilesDir(basename($f));
        }

        return null;
    }

    /**
     * Searches TL_ROOT/files recursively for a file with the given name.
     *
     * glob() has no recursive "**" wildcard in PHP, so the tree is walked with
     * SPL iterators. When several files share the name, the alphabetically
     * first path is returned so repeated runs resolve to the same URL.
     *
     * @return string|null Path relative to TL_ROOT (starting with "/files/") or null.
     */
    private function findBelowFilesDir(string $fileName): ?string
    {
        $filesDir = TL_ROOT . '/files';

        if ($fileName === '' || !is_dir($filesDir)) {
            return null;
        }

        $entries = new \RecursiveIteratorIterator(
            new \RecursiveDirectoryIterator(
                $filesDir,
                \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::UNIX_PATHS
            ),
            \RecursiveIteratorIterator::LEAVES_ONLY,
            // Skip unreadable sub-directories instead of aborting the search.
            \RecursiveIteratorIterator::CATCH_GET_CHILD
        );

        $hits = [];

        foreach ($entries as $entry) {
            if ($entry->isFile() && $entry->getFilename() === $fileName) {
                $hits[] = $entry->getPathname();
            }
        }

        if ($hits === []) {
            return null;
        }

        sort($hits, SORT_STRING);

        // Strip only the leading root, not every occurrence of that string.
        return substr($hits[0], strlen(TL_ROOT));
    }

    /**
     * Maps a "/files/..." path to an absolute file system path below TL_ROOT.
     *
     * Paths that do not start with "/files/" or that contain a ".." segment
     * (literally or percent-encoded) are rejected so a link cannot point
     * outside the files directory. If the literal path does not exist but its
     * percent-decoded form does (e.g. "%20" for a space), the decoded path is
     * returned.
     *
     * @return string|null Absolute path (existence is not guaranteed) or null if rejected.
     */
    private function getAbsolutePath(string $url): ?string
    {
        $decoded = rawurldecode($url);

        foreach ([$url, $decoded] as $candidate) {
            if (!str_starts_with($candidate, '/files/') || $this->hasParentSegment($candidate)) {
                return null;
            }
        }

        $literalPath = TL_ROOT . $url;
        $decodedPath = TL_ROOT . $decoded;

        if ($decoded !== $url && !is_file($literalPath) && is_file($decodedPath)) {
            return $decodedPath;
        }

        return $literalPath;
    }

    /**
     * Tells whether a path contains a ".." segment (either slash style).
     */
    private function hasParentSegment(string $path): bool
    {
        return in_array('..', explode('/', str_replace('\\', '/', $path)), true);
    }
}