Compare commits
16 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c223ae692f | |||
| b4cd9199c8 | |||
| 6329c9e790 | |||
| d2c9263755 | |||
| e9f06f7cc9 | |||
| 6d2f4458bc | |||
| 9adad9ca8d | |||
| 356b18c8c8 | |||
| 7dc30c435f | |||
| ac001fb53c | |||
| 6ea558bbca | |||
| cf0a84b85e | |||
| d9b8646835 | |||
| b684267541 | |||
| 0e20a813af | |||
| c2a01d66f8 |
@@ -0,0 +1,74 @@
|
||||
# Contao Meilisearch Bundle
|
||||
|
||||
Eine schlanke Schnittstelle zwischen **Contao CMS (4.13, 5.6 und 5.7) unter PHP 8.4** und einer **selbst gehosteten Meilisearch-Instanz**.
|
||||
Das Bundle erweitert den Contao-Suchindex um strukturierte Daten und ermöglicht eine performante, moderne Volltextsuche.
|
||||
|
||||
---
|
||||
|
||||
## ✨ Features
|
||||
|
||||
- Integration von **Meilisearch** als externe Suchmaschine
|
||||
- Indexierung von:
|
||||
- Contao-Seiten
|
||||
- Inhaltselementen
|
||||
- **PDF-Dateien**
|
||||
- **Office-Dokumenten** (DOCX, XLSX, PPTX)
|
||||
- Unterstützung für:
|
||||
- Seiten-Prioritäten
|
||||
- Keywords
|
||||
- Vorschaubild
|
||||
- Kompatibel mit:
|
||||
- Contao **4.13**, **5.6** und **5.7**
|
||||
- PHP **8.4**
|
||||
|
||||
---
|
||||
|
||||
## ⏱️ Scheduled Indexing (Cron setup)
|
||||
|
||||
Das Bundle stellt eigene Commands zur Verfügung, um Dateien zu bereinigen und den Meilisearch-Index neu aufzubauen.
|
||||
Für den produktiven Einsatz wird empfohlen, diese Commands regelmäßig per **System-Crontab** auszuführen.
|
||||
|
||||
Das Bundle nutzt **keinen eigenen Contao-Cron**, sondern System-Cronjobs.
|
||||
|
||||
## Verfügbare Commands
|
||||
|
||||
### Datei-Cleanup
|
||||
|
||||
```
|
||||
vendor/bin/contao-console meilisearch:files:cleanup
|
||||
```
|
||||
|
||||
### Meilisearch-Index
|
||||
|
||||
```
|
||||
vendor/bin/contao-console meilisearch:index
|
||||
```
|
||||
|
||||
## Empfohlene Reihenfolge
|
||||
|
||||
1. Datei-Cleanup
|
||||
`vendor/bin/contao-console meilisearch:files:cleanup`
|
||||
|
||||
2. Contao-Crawl (ca. 1 Minute später)
|
||||
`vendor/bin/contao-console contao:crawl`
|
||||
|
||||
3. Meilisearch-Index (ca. 15 Minuten später)
|
||||
`vendor/bin/contao-console meilisearch:index`
|
||||
|
||||
## Beispiel Crontab
|
||||
|
||||
```
|
||||
0 5 * * * /usr/bin/php8.4 /path/to/project/vendor/bin/contao-console meilisearch:files:cleanup
|
||||
1 5 * * * /usr/bin/php8.4 /path/to/project/vendor/bin/contao-console contao:crawl
|
||||
15 5 * * * /usr/bin/php8.4 /path/to/project/vendor/bin/contao-console meilisearch:index
|
||||
```
|
||||
|
||||
## Logging
|
||||
|
||||
```
|
||||
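0 5 * * * /usr/bin/php8.4 /path/to/project/vendor/bin/contao-console meilisearch:files:cleanup >> /path/to/project/var/logs/meilisearch_cron.log 2>&1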
|
||||
```
|
||||
|
||||
## Lizenz
|
||||
|
||||
MIT
|
||||
@@ -0,0 +1,106 @@
|
||||
<?php
|
||||
|
||||
namespace MummertMedia\ContaoMeilisearchBundle\Command;
|
||||
|
||||
use Contao\CoreBundle\Framework\ContaoFramework;
|
||||
use Contao\Database;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
|
||||
/**
 * Console command that removes stale rows from tl_search_pdf.
 *
 * A row counts as stale when its last_seen timestamp is older than
 * "now minus the grace period". With --dry-run the rows are only counted.
 */
class MeilisearchFilesCleanupCommand extends Command
{
    /**
     * Grace period in seconds used when --grace is absent or has no value (24 hours).
     */
    private const DEFAULT_GRACE = 86400;

    /**
     * @param ContaoFramework $framework Contao framework, initialized before the database is used.
     */
    public function __construct(
        private readonly ContaoFramework $framework,
    ) {
        parent::__construct();
    }

    /**
     * Registers the command name, its description and the --grace / --dry-run options.
     */
    protected function configure(): void
    {
        $this
            ->setName('meilisearch:files:cleanup')
            ->setDescription('Remove stale indexed files (PDF, DOCX, XLSX, PPTX) from tl_search_pdf')
            ->addOption(
                'grace',
                null,
                InputOption::VALUE_OPTIONAL,
                'Grace period in seconds (files newer than now-grace are kept)',
                self::DEFAULT_GRACE
            )
            ->addOption(
                'dry-run',
                null,
                InputOption::VALUE_NONE,
                'Show how many entries would be removed without deleting them'
            );
    }

    /**
     * Counts (dry run) or deletes all rows whose last_seen lies before the cutoff.
     *
     * @return int Command::SUCCESS on success, Command::FAILURE if anything throws
     *             (including an invalid --grace value).
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $this->framework->initialize();

        $this->log('Cleaner gestartet');

        try {
            $grace = $this->resolveGrace($input->getOption('grace'));
            $dryRun = (bool) $input->getOption('dry-run');
            $cutoff = time() - $grace;
            $cutoffLabel = date('Y-m-d H:i:s', $cutoff);

            if ($dryRun) {
                $count = (int) Database::getInstance()
                    ->prepare('SELECT COUNT(*) AS cnt FROM tl_search_pdf WHERE last_seen < ?')
                    ->execute($cutoff)
                    ->cnt;

                $message = sprintf(
                    '[DRY-RUN] %d stale file(s) would be removed (last_seen < %s)',
                    $count,
                    $cutoffLabel
                );
                $tag = 'comment';
            } else {
                $affected = (int) Database::getInstance()
                    ->prepare('DELETE FROM tl_search_pdf WHERE last_seen < ?')
                    ->execute($cutoff)
                    ->affectedRows;

                $message = sprintf(
                    'Removed %d stale file(s) (last_seen < %s)',
                    $affected,
                    $cutoffLabel
                );
                $tag = 'info';
            }

            $output->writeln('<' . $tag . '>' . $message . '</' . $tag . '>');
            $this->log($message);

            $this->log('Cleaner successfully stopped');

            return Command::SUCCESS;
        } catch (\Throwable $e) {
            $this->log('Cleaner ERROR: ' . $e->getMessage());
            $output->writeln('<error>' . $e->getMessage() . '</error>');

            return Command::FAILURE;
        }
    }

    /**
     * Turns the raw --grace option value into a non-negative number of seconds.
     *
     * A bare "--grace" (value null) or an empty string falls back to the default
     * instead of being cast to 0, which would move the cutoff to "now" and make
     * the DELETE remove every row. Negative values are clamped to 0.
     *
     * @param mixed $value Raw option value as returned by the input object.
     *
     * @throws \InvalidArgumentException if the value is not an integer.
     */
    private function resolveGrace(mixed $value): int
    {
        if ($value === null || $value === '') {
            return self::DEFAULT_GRACE;
        }

        $grace = filter_var($value, FILTER_VALIDATE_INT);

        if ($grace === false) {
            throw new \InvalidArgumentException(sprintf(
                'Invalid value "%s" for --grace: expected a whole number of seconds',
                is_scalar($value) ? (string) $value : get_debug_type($value)
            ));
        }

        return max(0, $grace);
    }

    /**
     * Uniform logging with a timestamp prefix, written via error_log().
     */
    private function log(string $message): void
    {
        error_log(sprintf(
            '[%s] %s',
            date('Y-m-d H:i:s'),
            $message
        ));
    }
}
|
||||
@@ -24,12 +24,34 @@ class MeilisearchIndexCommand extends Command
|
||||
|
||||
/**
 * Runs the index service and reports start, success or failure both to the
 * log and to the console output.
 *
 * @return int Command::SUCCESS when the run completes, Command::FAILURE if anything throws.
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
    $this->log('Meilisearch index gestartet');
    $output->writeln('<info>Meilisearch index started</info>');

    try {
        $this->indexService->run();

        $this->log('Meilisearch index successfully stopped');
        $output->writeln('<info>Meilisearch index finished</info>');

        $exitCode = Command::SUCCESS;
    } catch (\Throwable $failure) {
        $reason = $failure->getMessage();

        $this->log('Meilisearch index ERROR: ' . $reason);
        $output->writeln('<error>' . $reason . '</error>');

        $exitCode = Command::FAILURE;
    }

    return $exitCode;
}
|
||||
|
||||
/**
 * Uniform logging: prefixes the message with the current timestamp and
 * passes it to error_log().
 */
private function log(string $message): void
{
    $timestamp = date('Y-m-d H:i:s');

    error_log('[' . $timestamp . '] ' . $message);
}
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
<?php
|
||||
namespace MummertMedia\ContaoMeilisearchBundle\Cron;
|
||||
|
||||
use Contao\CoreBundle\Framework\ContaoFramework;
|
||||
use MummertMedia\ContaoMeilisearchBundle\Service\MeilisearchIndexService;
|
||||
|
||||
/**
 * Invokable cron handler that initializes Contao and runs the index service.
 */
class MeilisearchIndexCron
{
    /**
     * @param MeilisearchIndexService $indexService Service whose run() performs the indexing.
     * @param ContaoFramework         $framework    Contao framework, initialized before each run.
     */
    public function __construct(
        private readonly MeilisearchIndexService $indexService,
        private readonly ContaoFramework $framework,
    ) {
    }

    /**
     * Initializes the Contao framework, then triggers a single indexing run.
     */
    public function __invoke(): void
    {
        // Contao must be initialized first (important!)
        $this->framework->initialize();

        // Index once per day
        $this->indexService->run();
    }
}
|
||||
@@ -30,22 +30,6 @@ class IndexPageListener
|
||||
'set_keys' => array_keys($set),
|
||||
]);
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* PDF: Reset genau 1× pro Crawl
|
||||
* =====================
|
||||
*/
|
||||
try {
|
||||
$this->debug('PDF resetTableOnce(): call');
|
||||
$this->pdfIndexService->resetTableOnce();
|
||||
$this->debug('PDF resetTableOnce(): ok');
|
||||
} catch (\Throwable $e) {
|
||||
$this->debug('PDF resetTableOnce(): failed', [
|
||||
'error' => $e->getMessage(),
|
||||
'class' => $e::class,
|
||||
]);
|
||||
}
|
||||
|
||||
/*
|
||||
* =====================
|
||||
* SEITEN-METADATEN
|
||||
|
||||
@@ -3,22 +3,22 @@ services:
|
||||
Psr\Container\ContainerInterface: '@service_container'
|
||||
|
||||
MummertMedia\ContaoMeilisearchBundle\:
|
||||
resource: '../../{Command,Cron,EventListener,Service}'
|
||||
resource: '../../{Command,EventListener,Service}'
|
||||
autowire: true
|
||||
autoconfigure: true
|
||||
|
||||
MummertMedia\ContaoMeilisearchBundle\EventListener\MeilisearchPageMarkerListener:
|
||||
autowire: true
|
||||
autoconfigure: false
|
||||
tags:
|
||||
- { name: contao.hook, hook: outputFrontendTemplate, method: onOutputFrontendTemplate }
|
||||
|
||||
MummertMedia\ContaoMeilisearchBundle\EventListener\IndexPageListener:
|
||||
autowire: true
|
||||
autoconfigure: false
|
||||
tags:
|
||||
- { name: contao.hook, hook: indexPage, method: onIndexPage }
|
||||
|
||||
MummertMedia\ContaoMeilisearchBundle\Cron\MeilisearchIndexCron:
|
||||
autowire: true
|
||||
autoconfigure: false
|
||||
tags:
|
||||
- { name: contao.cron, interval: daily, method: __invoke }
|
||||
|
||||
MummertMedia\ContaoMeilisearchBundle\Controller\FrontendModule\MeilisearchSearchController:
|
||||
autowire: true
|
||||
autoconfigure: false
|
||||
|
||||
@@ -8,10 +8,11 @@ $GLOBALS['TL_DCA']['tl_search_pdf'] = [
|
||||
'sql' => [
|
||||
'keys' => [
|
||||
'id' => 'primary',
|
||||
'checksum' => 'unique',
|
||||
'page_id' => 'index',
|
||||
'url' => 'index',
|
||||
'type' => 'index', // ⬅️ NEU
|
||||
'url' => 'unique',
|
||||
'type' => 'index',
|
||||
'checksum' => 'index',
|
||||
'last_seen' => 'index', // ⬅️ NEU (für Cleanup-Performance)
|
||||
],
|
||||
],
|
||||
],
|
||||
@@ -25,10 +26,18 @@ $GLOBALS['TL_DCA']['tl_search_pdf'] = [
|
||||
'sql' => "int(10) unsigned NOT NULL default 0",
|
||||
],
|
||||
|
||||
/*
|
||||
* Zeitpunkt, wann die Datei zuletzt beim Crawl gesehen wurde
|
||||
* → Basis für Cleanup
|
||||
*/
|
||||
'last_seen' => [ // ⬅️ NEU
|
||||
'sql' => "int(10) unsigned NOT NULL default 0",
|
||||
],
|
||||
|
||||
/*
|
||||
* Dateityp: pdf | docx | xlsx | pptx
|
||||
*/
|
||||
'type' => [ // ⬅️ NEU
|
||||
'type' => [
|
||||
'sql' => "varchar(16) NOT NULL default 'pdf'",
|
||||
],
|
||||
|
||||
@@ -64,7 +73,7 @@ $GLOBALS['TL_DCA']['tl_search_pdf'] = [
|
||||
|
||||
/*
|
||||
* Herkunftsseite (tl_page.id)
|
||||
* → Cleanup / Referenz
|
||||
* → optional, Debug / Referenz
|
||||
*/
|
||||
'page_id' => [
|
||||
'sql' => "int(10) unsigned NOT NULL default 0",
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
Time,Source,URI,"Found on URI","Found on level",Tags,Message
|
||||
"2025-12-28 10:44:12.357228","Contao\CoreBundle\Crawl\Escargot\Subscriber\SearchIndexSubscriber",https://flowerpowerabi.de/,https://flowerpowerabi.de/sitemap.xml,3,,"Forwarded to the search indexer. Was indexed successfully."
|
||||
"2025-12-28 10:44:12.413689","Contao\CoreBundle\Crawl\Escargot\Subscriber\SearchIndexSubscriber",https://flowerpowerabi.de/testseite-fuer-meilisearch,https://flowerpowerabi.de/sitemap.xml,3,,"Forwarded to the search indexer. Was indexed successfully."
|
||||
|
@@ -12,9 +12,6 @@ class OfficeIndexService
|
||||
{
|
||||
private string $projectDir;
|
||||
|
||||
// pro Crawl-Durchlauf: doppelte Verarbeitung vermeiden
|
||||
private array $seenThisCrawl = [];
|
||||
|
||||
public function __construct(ParameterBagInterface $params)
|
||||
{
|
||||
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
|
||||
@@ -25,6 +22,10 @@ class OfficeIndexService
|
||||
*/
|
||||
public function handleOfficeLinks(array $officeLinks): void
|
||||
{
|
||||
// Dedupe nur pro Aufruf (nicht "pro Crawl")
|
||||
$seen = [];
|
||||
$now = time();
|
||||
|
||||
foreach ($officeLinks as $row) {
|
||||
$url = (string) ($row['url'] ?? '');
|
||||
$linkText = $row['linkText'] ?? null;
|
||||
@@ -33,13 +34,12 @@ class OfficeIndexService
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
// innerhalb des Crawls gleiche URL nicht mehrfach parsen
|
||||
// doppelte URLs pro Aufruf vermeiden
|
||||
$seenKey = md5($url);
|
||||
if (isset($this->seenThisCrawl[$seenKey])) {
|
||||
if (isset($seen[$seenKey])) {
|
||||
continue;
|
||||
}
|
||||
$this->seenThisCrawl[$seenKey] = true;
|
||||
$seen[$seenKey] = true;
|
||||
|
||||
$normalized = $this->normalizeOfficeUrl($url);
|
||||
if ($normalized === null) {
|
||||
@@ -56,27 +56,37 @@ class OfficeIndexService
|
||||
$mtime = (int) (filemtime($absolutePath) ?: 0);
|
||||
$checksum = md5($relativePath . '|' . $mtime);
|
||||
|
||||
$title = $linkText ?: basename($absolutePath);
|
||||
// existiert bereits?
|
||||
$existing = Database::getInstance()
|
||||
->prepare('SELECT checksum FROM tl_search_pdf WHERE url=? LIMIT 1')
|
||||
->execute($relativePath)
|
||||
->fetchAssoc();
|
||||
|
||||
$needsParse = !$existing || ($existing['checksum'] ?? '') !== $checksum;
|
||||
|
||||
// Titel-Priorität:
|
||||
// 1) Linktext
|
||||
// 2) Dateiname
|
||||
$title = $linkText ?: basename($absolutePath);
|
||||
$text = '';
|
||||
|
||||
if ($needsParse) {
|
||||
$text = $this->parseOfficeFile($absolutePath, $type);
|
||||
if ($text === '') {
|
||||
// Parsing fehlgeschlagen → nichts überschreiben
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
$this->upsertOffice(
|
||||
$relativePath,
|
||||
$title,
|
||||
$text,
|
||||
$text, // kann '' sein → SQL überschreibt dann nicht
|
||||
$checksum,
|
||||
$mtime,
|
||||
$type
|
||||
$type,
|
||||
$now
|
||||
);
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
error_log(
|
||||
'[ContaoMeilisearch] Office indexing failed for "' . $url . '": ' . $e->getMessage()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -88,6 +98,10 @@ class OfficeIndexService
|
||||
$decoded = html_entity_decode($url);
|
||||
$parts = parse_url($decoded);
|
||||
|
||||
if (!$parts) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// 1) files/... (ohne führenden Slash)
|
||||
if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/')) {
|
||||
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
|
||||
@@ -117,8 +131,8 @@ class OfficeIndexService
|
||||
$ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
|
||||
|
||||
if (
|
||||
str_starts_with($file, 'files/')
|
||||
&& in_array($ext, ['docx', 'xlsx', 'pptx'], true)
|
||||
str_starts_with($file, 'files/') &&
|
||||
in_array($ext, ['docx', 'xlsx', 'pptx'], true)
|
||||
) {
|
||||
return ['/' . $file, $ext];
|
||||
}
|
||||
@@ -148,25 +162,28 @@ class OfficeIndexService
|
||||
string $text,
|
||||
string $checksum,
|
||||
int $mtime,
|
||||
string $type
|
||||
string $type,
|
||||
int $now
|
||||
): void {
|
||||
try {
|
||||
Database::getInstance()
|
||||
->prepare('
|
||||
INSERT INTO tl_search_pdf
|
||||
(tstamp, type, url, title, text, checksum, file_mtime)
|
||||
(tstamp, last_seen, type, url, title, text, checksum, file_mtime)
|
||||
VALUES
|
||||
(?, ?, ?, ?, ?, ?, ?)
|
||||
(?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
tstamp = VALUES(tstamp),
|
||||
last_seen = VALUES(last_seen),
|
||||
type = VALUES(type),
|
||||
url = VALUES(url),
|
||||
title = VALUES(title),
|
||||
text=VALUES(text),
|
||||
file_mtime=VALUES(file_mtime)
|
||||
checksum = VALUES(checksum),
|
||||
file_mtime = VALUES(file_mtime),
|
||||
text = IF(VALUES(text) = "" OR VALUES(text) IS NULL, text, VALUES(text))
|
||||
')
|
||||
->execute(
|
||||
time(),
|
||||
$now,
|
||||
$now,
|
||||
$type,
|
||||
$url,
|
||||
$title,
|
||||
@@ -174,11 +191,6 @@ class OfficeIndexService
|
||||
$checksum,
|
||||
$mtime
|
||||
);
|
||||
} catch (\Throwable $e) {
|
||||
error_log(
|
||||
'[ContaoMeilisearch] Failed to write Office index entry (' . $url . '): ' . $e->getMessage()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
private function parseOfficeFile(string $absolutePath, string $type): string
|
||||
@@ -206,10 +218,7 @@ class OfficeIndexService
|
||||
}
|
||||
|
||||
return $this->cleanText($text);
|
||||
} catch (\Throwable $e) {
|
||||
error_log(
|
||||
'[ContaoMeilisearch] Failed to parse DOCX "' . $absolutePath . '": ' . $e->getMessage()
|
||||
);
|
||||
} catch (\Throwable) {
|
||||
return '';
|
||||
}
|
||||
}
|
||||
@@ -227,10 +236,7 @@ class OfficeIndexService
|
||||
}
|
||||
|
||||
return $this->cleanText($text);
|
||||
} catch (\Throwable $e) {
|
||||
error_log(
|
||||
'[ContaoMeilisearch] Failed to parse XLSX "' . $absolutePath . '": ' . $e->getMessage()
|
||||
);
|
||||
} catch (\Throwable) {
|
||||
return '';
|
||||
}
|
||||
}
|
||||
@@ -250,10 +256,7 @@ class OfficeIndexService
|
||||
}
|
||||
|
||||
return $this->cleanText($text);
|
||||
} catch (\Throwable $e) {
|
||||
error_log(
|
||||
'[ContaoMeilisearch] Failed to parse PPTX "' . $absolutePath . '": ' . $e->getMessage()
|
||||
);
|
||||
} catch (\Throwable) {
|
||||
return '';
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,34 +10,20 @@ class PdfIndexService
|
||||
{
|
||||
private string $projectDir;
|
||||
|
||||
private bool $didReset = false;
|
||||
private array $seenThisCrawl = [];
|
||||
|
||||
public function __construct(ParameterBagInterface $params)
|
||||
{
|
||||
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
|
||||
}
|
||||
|
||||
/**
|
||||
* Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen.
|
||||
*/
|
||||
public function resetTableOnce(): void
|
||||
{
|
||||
if ($this->didReset) {
|
||||
return;
|
||||
}
|
||||
|
||||
$this->didReset = true;
|
||||
$this->seenThisCrawl = [];
|
||||
|
||||
Database::getInstance()->execute('TRUNCATE tl_search_pdf');
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array<int,array{url:string,linkText:?string}> $pdfLinks
|
||||
*/
|
||||
public function handlePdfLinks(array $pdfLinks): void
|
||||
{
|
||||
// Dedupe nur pro Aufruf (nicht "pro Crawl")
|
||||
$seen = [];
|
||||
$now = time();
|
||||
|
||||
foreach ($pdfLinks as $row) {
|
||||
$url = (string) ($row['url'] ?? '');
|
||||
$linkText = $row['linkText'] ?? null;
|
||||
@@ -46,12 +32,12 @@ class PdfIndexService
|
||||
continue;
|
||||
}
|
||||
|
||||
// innerhalb eines Crawls doppelte URLs vermeiden
|
||||
// doppelte URLs pro Aufruf vermeiden
|
||||
$seenKey = md5($url);
|
||||
if (isset($this->seenThisCrawl[$seenKey])) {
|
||||
if (isset($seen[$seenKey])) {
|
||||
continue;
|
||||
}
|
||||
$this->seenThisCrawl[$seenKey] = true;
|
||||
$seen[$seenKey] = true;
|
||||
|
||||
$normalizedPath = $this->normalizePdfUrl($url);
|
||||
if ($normalizedPath === null) {
|
||||
@@ -66,24 +52,39 @@ class PdfIndexService
|
||||
$mtime = (int) (filemtime($absolutePath) ?: 0);
|
||||
$checksum = md5($normalizedPath . '|' . $mtime);
|
||||
|
||||
// existiert bereits?
|
||||
$existing = Database::getInstance()
|
||||
->prepare('SELECT checksum FROM tl_search_pdf WHERE url=? LIMIT 1')
|
||||
->execute($normalizedPath)
|
||||
->fetchAssoc();
|
||||
|
||||
$needsParse = !$existing || ($existing['checksum'] ?? '') !== $checksum;
|
||||
|
||||
// Titel-Priorität:
|
||||
// 1) Linktext
|
||||
// 2) PDF-Metadaten
|
||||
// 3) Dateiname
|
||||
$title = $linkText ?: basename($absolutePath);
|
||||
$text = '';
|
||||
|
||||
if ($needsParse) {
|
||||
$pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
|
||||
$title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
|
||||
|
||||
$text = $this->parsePdf($absolutePath);
|
||||
if ($text === '') {
|
||||
// wenn parsing fehlschlägt, NICHT überschreiben
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
$this->upsertPdf(
|
||||
$normalizedPath,
|
||||
$title,
|
||||
$text,
|
||||
$text, // kann '' sein → wird in SQL nicht überschrieben
|
||||
$checksum,
|
||||
$mtime
|
||||
$mtime,
|
||||
$now
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -93,6 +94,10 @@ class PdfIndexService
|
||||
$decoded = html_entity_decode($url);
|
||||
$parts = parse_url($decoded);
|
||||
|
||||
if (!$parts) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// 1) files/...pdf (ohne führenden Slash)
|
||||
if (
|
||||
!empty($parts['path'])
|
||||
@@ -149,23 +154,29 @@ class PdfIndexService
|
||||
string $title,
|
||||
string $text,
|
||||
string $checksum,
|
||||
int $mtime
|
||||
int $mtime,
|
||||
int $now
|
||||
): void {
|
||||
Database::getInstance()
|
||||
->prepare('
|
||||
INSERT INTO tl_search_pdf
|
||||
(tstamp, url, title, text, checksum, file_mtime)
|
||||
(tstamp, last_seen, type, url, title, text, checksum, file_mtime)
|
||||
VALUES
|
||||
(?, ?, ?, ?, ?, ?)
|
||||
(?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
tstamp = VALUES(tstamp),
|
||||
last_seen = VALUES(last_seen),
|
||||
type = VALUES(type),
|
||||
url = VALUES(url),
|
||||
title = VALUES(title),
|
||||
text=VALUES(text),
|
||||
file_mtime=VALUES(file_mtime)
|
||||
checksum = VALUES(checksum),
|
||||
file_mtime = VALUES(file_mtime),
|
||||
text = IF(VALUES(text) = "" OR VALUES(text) IS NULL, text, VALUES(text))
|
||||
')
|
||||
->execute(
|
||||
time(),
|
||||
$now,
|
||||
$now,
|
||||
'pdf',
|
||||
$url,
|
||||
$title,
|
||||
$text,
|
||||
@@ -203,6 +214,7 @@ class PdfIndexService
|
||||
}
|
||||
}
|
||||
} catch (\Throwable) {
|
||||
// ignore
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
Reference in New Issue
Block a user