15 Commits

Author SHA1 Message Date
Jürgen Mummert f16e7a98d1 Fix duplicate Meilisearch marker injection 2026-01-09 11:03:41 +01:00
Jürgen Mummert c223ae692f add logging 2026-01-09 10:32:05 +01:00
Jürgen Mummert b4cd9199c8 add logging 2026-01-09 10:17:57 +01:00
Jürgen Mummert 6329c9e790 remove cron 2026-01-09 09:52:22 +01:00
Jürgen Mummert d2c9263755 add logging to cron 2026-01-09 09:40:08 +01:00
Jürgen Mummert e9f06f7cc9 services.yml change 2026-01-06 09:07:13 +01:00
Jürgen Mummert 6d2f4458bc add cron 2026-01-05 11:28:02 +01:00
Jürgen Mummert 9adad9ca8d add cron 2026-01-05 11:19:09 +01:00
Jürgen Mummert 356b18c8c8 add cron 2026-01-05 11:13:11 +01:00
Jürgen Mummert 7dc30c435f add cron 2026-01-05 11:05:35 +01:00
Jürgen Mummert ac001fb53c change Grace period zu 24h 2026-01-05 10:43:16 +01:00
Jürgen Mummert 6ea558bbca remove table reset 2026-01-05 10:37:21 +01:00
Jürgen Mummert cf0a84b85e add last_seen 2026-01-05 10:29:09 +01:00
Jürgen Mummert d9b8646835 Change Delete Command 2026-01-05 10:25:58 +01:00
Jürgen Mummert b684267541 Add cleanup command for stale indexed files 2026-01-05 10:21:37 +01:00
10 changed files with 350 additions and 187 deletions
+47 -5
View File
@@ -20,13 +20,55 @@ Das Bundle erweitert den Contao-Suchindex um strukturierte Daten und ermöglicht
- Kompatibel mit: - Kompatibel mit:
- Contao **4.13**, **5.6** und **5.7** - Contao **4.13**, **5.6** und **5.7**
- PHP **8.4** - PHP **8.4**
- Entwickelt als **eigenständiges Contao-Bundle**
--- ---
## 📦 Installation ## ⏱️ Scheduled Indexing (Cron setup)
Installation über Composer: Das Bundle stellt eigene Commands zur Verfügung, um Dateien zu bereinigen und den Meilisearch-Index neu aufzubauen.
Für den produktiven Einsatz wird empfohlen, diese Commands regelmäßig per **System-Crontab** auszuführen.
```bash Das Bundle nutzt **keinen eigenen Contao-Cron**, sondern System-Cronjobs.
composer require mummertmedia/contao-meilisearch-bundle:^0.1
## Verfügbare Commands
### Datei-Cleanup
```
vendor/bin/contao-console meilisearch:files:cleanup
```
### Meilisearch-Index
```
vendor/bin/contao-console meilisearch:index
```
## Empfohlene Reihenfolge
1. Datei-Cleanup
`vendor/bin/contao-console meilisearch:files:cleanup`
2. Contao-Crawl (ca. 1 Minute später)
`vendor/bin/contao-console contao:crawl`
3. Meilisearch-Index (ca. 15 Minuten später)
`vendor/bin/contao-console meilisearch:index`
## Beispiel Crontab
```
0 5 * * * /usr/bin/php8.4 /path/to/project/vendor/bin/contao-console meilisearch:files:cleanup
1 5 * * * /usr/bin/php8.4 /path/to/project/vendor/bin/contao-console contao:crawl
15 5 * * * /usr/bin/php8.4 /path/to/project/vendor/bin/contao-console meilisearch:index
```
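## Logging
Um die Ausgabe der Cronjobs zu protokollieren, kann an jeden Crontab-Eintrag folgende Umleitung angehängt werden (absoluter Pfad, da Cron nicht im Projektverzeichnis läuft):
```
>> /path/to/project/var/logs/meilisearch_cron.log 2>&1
```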
## Lizenz
MIT
@@ -0,0 +1,106 @@
<?php
namespace MummertMedia\ContaoMeilisearchBundle\Command;
use Contao\CoreBundle\Framework\ContaoFramework;
use Contao\Database;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
/**
 * Console command that removes stale rows from tl_search_pdf.
 *
 * A row counts as stale when its last_seen timestamp is older than
 * "now - grace". With --dry-run the stale rows are only counted and
 * reported; nothing is deleted.
 */
class MeilisearchFilesCleanupCommand extends Command
{
    /** Default grace period in seconds (24 hours). */
    private const DEFAULT_GRACE_SECONDS = 86400;

    public function __construct(
        private readonly ContaoFramework $framework,
    ) {
        parent::__construct();
    }

    /**
     * Registers the command name, its description and the --grace / --dry-run options.
     */
    protected function configure(): void
    {
        $this
            ->setName('meilisearch:files:cleanup')
            ->setDescription('Remove stale indexed files (PDF, DOCX, XLSX, PPTX) from tl_search_pdf')
            ->addOption(
                'grace',
                null,
                InputOption::VALUE_OPTIONAL,
                'Grace period in seconds (files newer than now-grace are kept)',
                self::DEFAULT_GRACE_SECONDS
            )
            ->addOption(
                'dry-run',
                null,
                InputOption::VALUE_NONE,
                'Show how many entries would be removed without deleting them'
            );
    }

    /**
     * Counts (dry-run) or deletes all tl_search_pdf rows with last_seen before the cutoff.
     *
     * Every failure, including an invalid --grace value, is logged, printed
     * as an error and reported through the exit code instead of being rethrown.
     *
     * @return int Command::SUCCESS on success, Command::FAILURE if anything throws
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $this->framework->initialize();
        $this->log('Cleaner gestartet');

        try {
            $grace = $this->resolveGrace($input->getOption('grace'));
            $dryRun = (bool) $input->getOption('dry-run');

            $cutoff = time() - $grace;
            $cutoffLabel = date('Y-m-d H:i:s', $cutoff);

            if ($dryRun) {
                $count = (int) Database::getInstance()
                    ->prepare('SELECT COUNT(*) AS cnt FROM tl_search_pdf WHERE last_seen < ?')
                    ->execute($cutoff)
                    ->cnt;

                $message = sprintf(
                    '[DRY-RUN] %d stale file(s) would be removed (last_seen < %s)',
                    $count,
                    $cutoffLabel
                );
                $output->writeln('<comment>' . $message . '</comment>');
            } else {
                $affected = (int) Database::getInstance()
                    ->prepare('DELETE FROM tl_search_pdf WHERE last_seen < ?')
                    ->execute($cutoff)
                    ->affectedRows;

                $message = sprintf(
                    'Removed %d stale file(s) (last_seen < %s)',
                    $affected,
                    $cutoffLabel
                );
                $output->writeln('<info>' . $message . '</info>');
            }

            $this->log($message);
            $this->log('Cleaner successfully stopped');

            return Command::SUCCESS;
        } catch (\Throwable $e) {
            $this->log('Cleaner ERROR: ' . $e->getMessage());
            $output->writeln('<error>' . $e->getMessage() . '</error>');

            return Command::FAILURE;
        }
    }

    /**
     * Turns the raw --grace option value into a non-negative number of seconds.
     *
     * A bare "--grace" (no value) arrives as null and falls back to the
     * default. Anything that is not a non-negative integer is rejected:
     * casting it with (int) would silently yield 0, which moves the cutoff
     * to "now" and makes the DELETE match every row last seen before this second.
     *
     * @throws \InvalidArgumentException if the value is not a non-negative integer
     */
    private function resolveGrace(mixed $raw): int
    {
        if ($raw === null) {
            return self::DEFAULT_GRACE_SECONDS;
        }

        $grace = filter_var($raw, FILTER_VALIDATE_INT, ['options' => ['min_range' => 0]]);

        if ($grace === false) {
            throw new \InvalidArgumentException(sprintf(
                'Invalid --grace value "%s": expected a non-negative number of seconds',
                is_scalar($raw) ? (string) $raw : get_debug_type($raw)
            ));
        }

        return $grace;
    }

    /**
     * Writes a message to the PHP error log, prefixed with a timestamp.
     */
    private function log(string $message): void
    {
        error_log(sprintf(
            '[%s] %s',
            date('Y-m-d H:i:s'),
            $message
        ));
    }
}
+25 -3
View File
@@ -24,12 +24,34 @@ class MeilisearchIndexCommand extends Command
protected function execute(InputInterface $input, OutputInterface $output): int protected function execute(InputInterface $input, OutputInterface $output): int
{ {
$this->log('Meilisearch index gestartet');
$output->writeln('<info>Meilisearch index started</info>'); $output->writeln('<info>Meilisearch index started</info>');
$this->indexService->run(); try {
$this->indexService->run();
$output->writeln('<info>Meilisearch index finished</info>'); $this->log('Meilisearch index successfully stopped');
$output->writeln('<info>Meilisearch index finished</info>');
return Command::SUCCESS; return Command::SUCCESS;
} catch (\Throwable $e) {
$this->log('Meilisearch index ERROR: ' . $e->getMessage());
$output->writeln('<error>' . $e->getMessage() . '</error>');
return Command::FAILURE;
}
}
/**
* Einheitliches Logging mit Zeitstempel
*/
private function log(string $message): void
{
error_log(sprintf(
'[%s] %s',
date('Y-m-d H:i:s'),
$message
));
} }
} }
-22
View File
@@ -1,22 +0,0 @@
<?php
namespace MummertMedia\ContaoMeilisearchBundle\Cron;
use Contao\CoreBundle\Framework\ContaoFramework;
use MummertMedia\ContaoMeilisearchBundle\Service\MeilisearchIndexService;
/**
 * Invokable cron handler that triggers one run of the Meilisearch index service.
 */
class MeilisearchIndexCron
{
    public function __construct(
        private readonly MeilisearchIndexService $indexService,
        private readonly ContaoFramework $framework,
    ) {
    }

    /**
     * Initializes the Contao framework and then runs the index service once.
     */
    public function __invoke(): void
    {
        // Contao must be initialized before the index service is used (important).
        $this->framework->initialize();

        // One indexing pass per invocation; intended to be triggered once a day.
        $this->indexService->run();
    }
}
-16
View File
@@ -30,22 +30,6 @@ class IndexPageListener
'set_keys' => array_keys($set), 'set_keys' => array_keys($set),
]); ]);
/*
* =====================
* PDF: Reset genau 1× pro Crawl
* =====================
*/
try {
$this->debug('PDF resetTableOnce(): call');
$this->pdfIndexService->resetTableOnce();
$this->debug('PDF resetTableOnce(): ok');
} catch (\Throwable $e) {
$this->debug('PDF resetTableOnce(): failed', [
'error' => $e->getMessage(),
'class' => $e::class,
]);
}
/* /*
* ===================== * =====================
* SEITEN-METADATEN * SEITEN-METADATEN
@@ -16,6 +16,13 @@ class MeilisearchPageMarkerListener
return $buffer; return $buffer;
} }
// ⛔ Marker bereits vorhanden → nichts mehr tun
if (str_contains($buffer, '⟦MEILISEARCH_META⟧')
|| str_contains($buffer, 'MEILISEARCH_JSON')
) {
return $buffer;
}
$data = []; $data = [];
/* /*
+7 -7
View File
@@ -3,22 +3,22 @@ services:
Psr\Container\ContainerInterface: '@service_container' Psr\Container\ContainerInterface: '@service_container'
MummertMedia\ContaoMeilisearchBundle\: MummertMedia\ContaoMeilisearchBundle\:
resource: '../../{Command,Cron,EventListener,Service}' resource: '../../{Command,EventListener,Service}'
autowire: true autowire: true
autoconfigure: true autoconfigure: true
MummertMedia\ContaoMeilisearchBundle\EventListener\MeilisearchPageMarkerListener:
autowire: true
autoconfigure: false
tags:
- { name: contao.hook, hook: outputFrontendTemplate, method: onOutputFrontendTemplate }
MummertMedia\ContaoMeilisearchBundle\EventListener\IndexPageListener: MummertMedia\ContaoMeilisearchBundle\EventListener\IndexPageListener:
autowire: true autowire: true
autoconfigure: false autoconfigure: false
tags: tags:
- { name: contao.hook, hook: indexPage, method: onIndexPage } - { name: contao.hook, hook: indexPage, method: onIndexPage }
MummertMedia\ContaoMeilisearchBundle\Cron\MeilisearchIndexCron:
autowire: true
autoconfigure: false
tags:
- { name: contao.cron, interval: daily, method: __invoke }
MummertMedia\ContaoMeilisearchBundle\Controller\FrontendModule\MeilisearchSearchController: MummertMedia\ContaoMeilisearchBundle\Controller\FrontendModule\MeilisearchSearchController:
autowire: true autowire: true
autoconfigure: false autoconfigure: false
+16 -7
View File
@@ -7,11 +7,12 @@ $GLOBALS['TL_DCA']['tl_search_pdf'] = [
'dataContainer' => DC_Table::class, 'dataContainer' => DC_Table::class,
'sql' => [ 'sql' => [
'keys' => [ 'keys' => [
'id' => 'primary', 'id' => 'primary',
'checksum' => 'unique', 'page_id' => 'index',
'page_id' => 'index', 'url' => 'unique',
'url' => 'index', 'type' => 'index',
'type' => 'index', // ⬅️ NEU 'checksum' => 'index',
'last_seen' => 'index', // ⬅️ NEU (für Cleanup-Performance)
], ],
], ],
], ],
@@ -25,10 +26,18 @@ $GLOBALS['TL_DCA']['tl_search_pdf'] = [
'sql' => "int(10) unsigned NOT NULL default 0", 'sql' => "int(10) unsigned NOT NULL default 0",
], ],
/*
* Zeitpunkt, wann die Datei zuletzt beim Crawl gesehen wurde
* → Basis für Cleanup
*/
'last_seen' => [ // ⬅️ NEU
'sql' => "int(10) unsigned NOT NULL default 0",
],
/* /*
* Dateityp: pdf | docx | xlsx | pptx * Dateityp: pdf | docx | xlsx | pptx
*/ */
'type' => [ // ⬅️ NEU 'type' => [
'sql' => "varchar(16) NOT NULL default 'pdf'", 'sql' => "varchar(16) NOT NULL default 'pdf'",
], ],
@@ -64,7 +73,7 @@ $GLOBALS['TL_DCA']['tl_search_pdf'] = [
/* /*
* Herkunftsseite (tl_page.id) * Herkunftsseite (tl_page.id)
* → Cleanup / Referenz * → optional, Debug / Referenz
*/ */
'page_id' => [ 'page_id' => [
'sql' => "int(10) unsigned NOT NULL default 0", 'sql' => "int(10) unsigned NOT NULL default 0",
+86 -83
View File
@@ -12,9 +12,6 @@ class OfficeIndexService
{ {
private string $projectDir; private string $projectDir;
// pro Crawl-Durchlauf: doppelte Verarbeitung vermeiden
private array $seenThisCrawl = [];
public function __construct(ParameterBagInterface $params) public function __construct(ParameterBagInterface $params)
{ {
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/'); $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
@@ -25,58 +22,71 @@ class OfficeIndexService
*/ */
public function handleOfficeLinks(array $officeLinks): void public function handleOfficeLinks(array $officeLinks): void
{ {
// Dedupe nur pro Aufruf (nicht "pro Crawl")
$seen = [];
$now = time();
foreach ($officeLinks as $row) { foreach ($officeLinks as $row) {
$url = (string) ($row['url'] ?? ''); $url = (string) ($row['url'] ?? '');
$linkText = $row['linkText'] ?? null; $linkText = $row['linkText'] ?? null;
if ($url === '') { if ($url === '') {
continue; continue;
} }
try { // doppelte URLs pro Aufruf vermeiden
// innerhalb des Crawls gleiche URL nicht mehrfach parsen $seenKey = md5($url);
$seenKey = md5($url); if (isset($seen[$seenKey])) {
if (isset($this->seenThisCrawl[$seenKey])) { continue;
continue; }
} $seen[$seenKey] = true;
$this->seenThisCrawl[$seenKey] = true;
$normalized = $this->normalizeOfficeUrl($url); $normalized = $this->normalizeOfficeUrl($url);
if ($normalized === null) { if ($normalized === null) {
continue; continue;
} }
[$relativePath, $type] = $normalized; [$relativePath, $type] = $normalized;
$absolutePath = $this->getAbsolutePath($relativePath); $absolutePath = $this->getAbsolutePath($relativePath);
if (!is_file($absolutePath)) { if (!is_file($absolutePath)) {
continue; continue;
} }
$mtime = (int) (filemtime($absolutePath) ?: 0); $mtime = (int) (filemtime($absolutePath) ?: 0);
$checksum = md5($relativePath . '|' . $mtime); $checksum = md5($relativePath . '|' . $mtime);
$title = $linkText ?: basename($absolutePath); // existiert bereits?
$existing = Database::getInstance()
->prepare('SELECT checksum FROM tl_search_pdf WHERE url=? LIMIT 1')
->execute($relativePath)
->fetchAssoc();
$needsParse = !$existing || ($existing['checksum'] ?? '') !== $checksum;
// Titel-Priorität:
// 1) Linktext
// 2) Dateiname
$title = $linkText ?: basename($absolutePath);
$text = '';
if ($needsParse) {
$text = $this->parseOfficeFile($absolutePath, $type); $text = $this->parseOfficeFile($absolutePath, $type);
if ($text === '') { if ($text === '') {
// Parsing fehlgeschlagen → nichts überschreiben
continue; continue;
} }
$this->upsertOffice(
$relativePath,
$title,
$text,
$checksum,
$mtime,
$type
);
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Office indexing failed for "' . $url . '": ' . $e->getMessage()
);
} }
$this->upsertOffice(
$relativePath,
$title,
$text, // kann '' sein → SQL überschreibt dann nicht
$checksum,
$mtime,
$type,
$now
);
} }
} }
@@ -86,7 +96,11 @@ class OfficeIndexService
private function normalizeOfficeUrl(string $url): ?array private function normalizeOfficeUrl(string $url): ?array
{ {
$decoded = html_entity_decode($url); $decoded = html_entity_decode($url);
$parts = parse_url($decoded); $parts = parse_url($decoded);
if (!$parts) {
return null;
}
// 1) files/... (ohne führenden Slash) // 1) files/... (ohne führenden Slash)
if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/')) { if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/')) {
@@ -114,11 +128,11 @@ class OfficeIndexService
if (!empty($query['file'])) { if (!empty($query['file'])) {
$file = urldecode((string) $query['file']); $file = urldecode((string) $query['file']);
$file = ltrim($file, '/'); $file = ltrim($file, '/');
$ext = strtolower(pathinfo($file, PATHINFO_EXTENSION)); $ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
if ( if (
str_starts_with($file, 'files/') str_starts_with($file, 'files/') &&
&& in_array($ext, ['docx', 'xlsx', 'pptx'], true) in_array($ext, ['docx', 'xlsx', 'pptx'], true)
) { ) {
return ['/' . $file, $ext]; return ['/' . $file, $ext];
} }
@@ -126,7 +140,7 @@ class OfficeIndexService
// 4) Contao 5: ?p=... // 4) Contao 5: ?p=...
if (!empty($query['p'])) { if (!empty($query['p'])) {
$p = urldecode((string) $query['p']); $p = urldecode((string) $query['p']);
$ext = strtolower(pathinfo($p, PATHINFO_EXTENSION)); $ext = strtolower(pathinfo($p, PATHINFO_EXTENSION));
if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) { if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) {
@@ -148,37 +162,35 @@ class OfficeIndexService
string $text, string $text,
string $checksum, string $checksum,
int $mtime, int $mtime,
string $type string $type,
int $now
): void { ): void {
try { Database::getInstance()
Database::getInstance() ->prepare('
->prepare(' INSERT INTO tl_search_pdf
INSERT INTO tl_search_pdf (tstamp, last_seen, type, url, title, text, checksum, file_mtime)
(tstamp, type, url, title, text, checksum, file_mtime) VALUES
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
(?, ?, ?, ?, ?, ?, ?) ON DUPLICATE KEY UPDATE
ON DUPLICATE KEY UPDATE tstamp = VALUES(tstamp),
tstamp=VALUES(tstamp), last_seen = VALUES(last_seen),
type=VALUES(type), type = VALUES(type),
url=VALUES(url), url = VALUES(url),
title=VALUES(title), title = VALUES(title),
text=VALUES(text), checksum = VALUES(checksum),
file_mtime=VALUES(file_mtime) file_mtime = VALUES(file_mtime),
') text = IF(VALUES(text) = "" OR VALUES(text) IS NULL, text, VALUES(text))
->execute( ')
time(), ->execute(
$type, $now,
$url, $now,
$title, $type,
$text, $url,
$checksum, $title,
$mtime $text,
); $checksum,
} catch (\Throwable $e) { $mtime
error_log(
'[ContaoMeilisearch] Failed to write Office index entry (' . $url . '): ' . $e->getMessage()
); );
}
} }
private function parseOfficeFile(string $absolutePath, string $type): string private function parseOfficeFile(string $absolutePath, string $type): string
@@ -206,10 +218,7 @@ class OfficeIndexService
} }
return $this->cleanText($text); return $this->cleanText($text);
} catch (\Throwable $e) { } catch (\Throwable) {
error_log(
'[ContaoMeilisearch] Failed to parse DOCX "' . $absolutePath . '": ' . $e->getMessage()
);
return ''; return '';
} }
} }
@@ -227,10 +236,7 @@ class OfficeIndexService
} }
return $this->cleanText($text); return $this->cleanText($text);
} catch (\Throwable $e) { } catch (\Throwable) {
error_log(
'[ContaoMeilisearch] Failed to parse XLSX "' . $absolutePath . '": ' . $e->getMessage()
);
return ''; return '';
} }
} }
@@ -250,10 +256,7 @@ class OfficeIndexService
} }
return $this->cleanText($text); return $this->cleanText($text);
} catch (\Throwable $e) { } catch (\Throwable) {
error_log(
'[ContaoMeilisearch] Failed to parse PPTX "' . $absolutePath . '": ' . $e->getMessage()
);
return ''; return '';
} }
} }
+56 -44
View File
@@ -10,48 +10,34 @@ class PdfIndexService
{ {
private string $projectDir; private string $projectDir;
private bool $didReset = false;
private array $seenThisCrawl = [];
public function __construct(ParameterBagInterface $params) public function __construct(ParameterBagInterface $params)
{ {
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/'); $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
} }
/**
* Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen.
*/
public function resetTableOnce(): void
{
if ($this->didReset) {
return;
}
$this->didReset = true;
$this->seenThisCrawl = [];
Database::getInstance()->execute('TRUNCATE tl_search_pdf');
}
/** /**
* @param array<int,array{url:string,linkText:?string}> $pdfLinks * @param array<int,array{url:string,linkText:?string}> $pdfLinks
*/ */
public function handlePdfLinks(array $pdfLinks): void public function handlePdfLinks(array $pdfLinks): void
{ {
// Dedupe nur pro Aufruf (nicht "pro Crawl")
$seen = [];
$now = time();
foreach ($pdfLinks as $row) { foreach ($pdfLinks as $row) {
$url = (string) ($row['url'] ?? ''); $url = (string) ($row['url'] ?? '');
$linkText = $row['linkText'] ?? null; $linkText = $row['linkText'] ?? null;
if ($url === '') { if ($url === '') {
continue; continue;
} }
// innerhalb eines Crawls doppelte URLs vermeiden // doppelte URLs pro Aufruf vermeiden
$seenKey = md5($url); $seenKey = md5($url);
if (isset($this->seenThisCrawl[$seenKey])) { if (isset($seen[$seenKey])) {
continue; continue;
} }
$this->seenThisCrawl[$seenKey] = true; $seen[$seenKey] = true;
$normalizedPath = $this->normalizePdfUrl($url); $normalizedPath = $this->normalizePdfUrl($url);
if ($normalizedPath === null) { if ($normalizedPath === null) {
@@ -63,27 +49,42 @@ class PdfIndexService
continue; continue;
} }
$mtime = (int) (filemtime($absolutePath) ?: 0); $mtime = (int) (filemtime($absolutePath) ?: 0);
$checksum = md5($normalizedPath . '|' . $mtime); $checksum = md5($normalizedPath . '|' . $mtime);
// existiert bereits?
$existing = Database::getInstance()
->prepare('SELECT checksum FROM tl_search_pdf WHERE url=? LIMIT 1')
->execute($normalizedPath)
->fetchAssoc();
$needsParse = !$existing || ($existing['checksum'] ?? '') !== $checksum;
// Titel-Priorität: // Titel-Priorität:
// 1) Linktext // 1) Linktext
// 2) PDF-Metadaten // 2) PDF-Metadaten
// 3) Dateiname // 3) Dateiname
$pdfMetaTitle = $this->readPdfMetaTitle($absolutePath); $title = $linkText ?: basename($absolutePath);
$title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath)); $text = '';
$text = $this->parsePdf($absolutePath); if ($needsParse) {
if ($text === '') { $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
continue; $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
$text = $this->parsePdf($absolutePath);
if ($text === '') {
// wenn parsing fehlschlägt, NICHT überschreiben
continue;
}
} }
$this->upsertPdf( $this->upsertPdf(
$normalizedPath, $normalizedPath,
$title, $title,
$text, $text, // kann '' sein → wird in SQL nicht überschrieben
$checksum, $checksum,
$mtime $mtime,
$now
); );
} }
} }
@@ -91,7 +92,11 @@ class PdfIndexService
private function normalizePdfUrl(string $url): ?string private function normalizePdfUrl(string $url): ?string
{ {
$decoded = html_entity_decode($url); $decoded = html_entity_decode($url);
$parts = parse_url($decoded); $parts = parse_url($decoded);
if (!$parts) {
return null;
}
// 1) files/...pdf (ohne führenden Slash) // 1) files/...pdf (ohne führenden Slash)
if ( if (
@@ -149,23 +154,29 @@ class PdfIndexService
string $title, string $title,
string $text, string $text,
string $checksum, string $checksum,
int $mtime int $mtime,
int $now
): void { ): void {
Database::getInstance() Database::getInstance()
->prepare(' ->prepare('
INSERT INTO tl_search_pdf INSERT INTO tl_search_pdf
(tstamp, url, title, text, checksum, file_mtime) (tstamp, last_seen, type, url, title, text, checksum, file_mtime)
VALUES VALUES
(?, ?, ?, ?, ?, ?) (?, ?, ?, ?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE ON DUPLICATE KEY UPDATE
tstamp=VALUES(tstamp), tstamp = VALUES(tstamp),
url=VALUES(url), last_seen = VALUES(last_seen),
title=VALUES(title), type = VALUES(type),
text=VALUES(text), url = VALUES(url),
file_mtime=VALUES(file_mtime) title = VALUES(title),
checksum = VALUES(checksum),
file_mtime = VALUES(file_mtime),
text = IF(VALUES(text) = "" OR VALUES(text) IS NULL, text, VALUES(text))
') ')
->execute( ->execute(
time(), $now,
$now,
'pdf',
$url, $url,
$title, $title,
$text, $text,
@@ -178,8 +189,8 @@ class PdfIndexService
{ {
try { try {
$parser = new Parser(); $parser = new Parser();
$pdf = $parser->parseFile($absolutePath); $pdf = $parser->parseFile($absolutePath);
$text = $this->cleanPdfContent($pdf->getText()); $text = $this->cleanPdfContent($pdf->getText());
return mb_substr($text, 0, 20000); return mb_substr($text, 0, 20000);
} catch (\Throwable) { } catch (\Throwable) {
@@ -190,8 +201,8 @@ class PdfIndexService
private function readPdfMetaTitle(string $absolutePath): ?string private function readPdfMetaTitle(string $absolutePath): ?string
{ {
try { try {
$parser = new Parser(); $parser = new Parser();
$pdf = $parser->parseFile($absolutePath); $pdf = $parser->parseFile($absolutePath);
$details = $pdf->getDetails(); $details = $pdf->getDetails();
foreach (['Title', 'title'] as $key) { foreach (['Title', 'title'] as $key) {
@@ -203,6 +214,7 @@ class PdfIndexService
} }
} }
} catch (\Throwable) { } catch (\Throwable) {
// ignore
} }
return null; return null;