Remove per-crawl table reset (resetTableOnce() no longer truncates tl_search_pdf)

This commit is contained in:
Jürgen Mummert
2026-01-05 10:37:21 +01:00
parent cf0a84b85e
commit 6ea558bbca
4 changed files with 144 additions and 145 deletions
-16
View File
@@ -30,22 +30,6 @@ class IndexPageListener
'set_keys' => array_keys($set), 'set_keys' => array_keys($set),
]); ]);
/*
* =====================
* PDF: Reset genau 1× pro Crawl
* =====================
*/
try {
$this->debug('PDF resetTableOnce(): call');
$this->pdfIndexService->resetTableOnce();
$this->debug('PDF resetTableOnce(): ok');
} catch (\Throwable $e) {
$this->debug('PDF resetTableOnce(): failed', [
'error' => $e->getMessage(),
'class' => $e::class,
]);
}
/* /*
* ===================== * =====================
* SEITEN-METADATEN * SEITEN-METADATEN
+2 -2
View File
@@ -8,10 +8,10 @@ $GLOBALS['TL_DCA']['tl_search_pdf'] = [
'sql' => [ 'sql' => [
'keys' => [ 'keys' => [
'id' => 'primary', 'id' => 'primary',
'checksum' => 'unique',
'page_id' => 'index', 'page_id' => 'index',
'url' => 'index', 'url' => 'unique',
'type' => 'index', 'type' => 'index',
'checksum' => 'index',
'last_seen' => 'index', // ⬅️ NEU (für Cleanup-Performance) 'last_seen' => 'index', // ⬅️ NEU (für Cleanup-Performance)
], ],
], ],
+45 -42
View File
@@ -12,9 +12,6 @@ class OfficeIndexService
{ {
private string $projectDir; private string $projectDir;
// pro Crawl-Durchlauf: doppelte Verarbeitung vermeiden
private array $seenThisCrawl = [];
public function __construct(ParameterBagInterface $params) public function __construct(ParameterBagInterface $params)
{ {
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/'); $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
@@ -25,6 +22,10 @@ class OfficeIndexService
*/ */
public function handleOfficeLinks(array $officeLinks): void public function handleOfficeLinks(array $officeLinks): void
{ {
// Dedupe nur pro Aufruf (nicht "pro Crawl")
$seen = [];
$now = time();
foreach ($officeLinks as $row) { foreach ($officeLinks as $row) {
$url = (string) ($row['url'] ?? ''); $url = (string) ($row['url'] ?? '');
$linkText = $row['linkText'] ?? null; $linkText = $row['linkText'] ?? null;
@@ -33,13 +34,12 @@ class OfficeIndexService
continue; continue;
} }
try { // doppelte URLs pro Aufruf vermeiden
// innerhalb des Crawls gleiche URL nicht mehrfach parsen
$seenKey = md5($url); $seenKey = md5($url);
if (isset($this->seenThisCrawl[$seenKey])) { if (isset($seen[$seenKey])) {
continue; continue;
} }
$this->seenThisCrawl[$seenKey] = true; $seen[$seenKey] = true;
$normalized = $this->normalizeOfficeUrl($url); $normalized = $this->normalizeOfficeUrl($url);
if ($normalized === null) { if ($normalized === null) {
@@ -56,27 +56,37 @@ class OfficeIndexService
$mtime = (int) (filemtime($absolutePath) ?: 0); $mtime = (int) (filemtime($absolutePath) ?: 0);
$checksum = md5($relativePath . '|' . $mtime); $checksum = md5($relativePath . '|' . $mtime);
$title = $linkText ?: basename($absolutePath); // existiert bereits?
$existing = Database::getInstance()
->prepare('SELECT checksum FROM tl_search_pdf WHERE url=? LIMIT 1')
->execute($relativePath)
->fetchAssoc();
$needsParse = !$existing || ($existing['checksum'] ?? '') !== $checksum;
// Titel-Priorität:
// 1) Linktext
// 2) Dateiname
$title = $linkText ?: basename($absolutePath);
$text = '';
if ($needsParse) {
$text = $this->parseOfficeFile($absolutePath, $type); $text = $this->parseOfficeFile($absolutePath, $type);
if ($text === '') { if ($text === '') {
// Parsing fehlgeschlagen → nichts überschreiben
continue; continue;
} }
}
$this->upsertOffice( $this->upsertOffice(
$relativePath, $relativePath,
$title, $title,
$text, $text, // kann '' sein → SQL überschreibt dann nicht
$checksum, $checksum,
$mtime, $mtime,
$type $type,
$now
); );
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Office indexing failed for "' . $url . '": ' . $e->getMessage()
);
}
} }
} }
@@ -88,6 +98,10 @@ class OfficeIndexService
$decoded = html_entity_decode($url); $decoded = html_entity_decode($url);
$parts = parse_url($decoded); $parts = parse_url($decoded);
if (!$parts) {
return null;
}
// 1) files/... (ohne führenden Slash) // 1) files/... (ohne führenden Slash)
if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/')) { if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/')) {
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION)); $ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
@@ -117,8 +131,8 @@ class OfficeIndexService
$ext = strtolower(pathinfo($file, PATHINFO_EXTENSION)); $ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
if ( if (
str_starts_with($file, 'files/') str_starts_with($file, 'files/') &&
&& in_array($ext, ['docx', 'xlsx', 'pptx'], true) in_array($ext, ['docx', 'xlsx', 'pptx'], true)
) { ) {
return ['/' . $file, $ext]; return ['/' . $file, $ext];
} }
@@ -148,25 +162,28 @@ class OfficeIndexService
string $text, string $text,
string $checksum, string $checksum,
int $mtime, int $mtime,
string $type string $type,
int $now
): void { ): void {
try {
Database::getInstance() Database::getInstance()
->prepare(' ->prepare('
INSERT INTO tl_search_pdf INSERT INTO tl_search_pdf
(tstamp, type, url, title, text, checksum, file_mtime) (tstamp, last_seen, type, url, title, text, checksum, file_mtime)
VALUES VALUES
(?, ?, ?, ?, ?, ?, ?) (?, ?, ?, ?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE ON DUPLICATE KEY UPDATE
tstamp = VALUES(tstamp), tstamp = VALUES(tstamp),
last_seen = VALUES(last_seen),
type = VALUES(type), type = VALUES(type),
url = VALUES(url), url = VALUES(url),
title = VALUES(title), title = VALUES(title),
text=VALUES(text), checksum = VALUES(checksum),
file_mtime=VALUES(file_mtime) file_mtime = VALUES(file_mtime),
text = IF(VALUES(text) = "" OR VALUES(text) IS NULL, text, VALUES(text))
') ')
->execute( ->execute(
time(), $now,
$now,
$type, $type,
$url, $url,
$title, $title,
@@ -174,11 +191,6 @@ class OfficeIndexService
$checksum, $checksum,
$mtime $mtime
); );
} catch (\Throwable $e) {
error_log(
'[ContaoMeilisearch] Failed to write Office index entry (' . $url . '): ' . $e->getMessage()
);
}
} }
private function parseOfficeFile(string $absolutePath, string $type): string private function parseOfficeFile(string $absolutePath, string $type): string
@@ -206,10 +218,7 @@ class OfficeIndexService
} }
return $this->cleanText($text); return $this->cleanText($text);
} catch (\Throwable $e) { } catch (\Throwable) {
error_log(
'[ContaoMeilisearch] Failed to parse DOCX "' . $absolutePath . '": ' . $e->getMessage()
);
return ''; return '';
} }
} }
@@ -227,10 +236,7 @@ class OfficeIndexService
} }
return $this->cleanText($text); return $this->cleanText($text);
} catch (\Throwable $e) { } catch (\Throwable) {
error_log(
'[ContaoMeilisearch] Failed to parse XLSX "' . $absolutePath . '": ' . $e->getMessage()
);
return ''; return '';
} }
} }
@@ -250,10 +256,7 @@ class OfficeIndexService
} }
return $this->cleanText($text); return $this->cleanText($text);
} catch (\Throwable $e) { } catch (\Throwable) {
error_log(
'[ContaoMeilisearch] Failed to parse PPTX "' . $absolutePath . '": ' . $e->getMessage()
);
return ''; return '';
} }
} }
+41 -29
View File
@@ -10,34 +10,20 @@ class PdfIndexService
{ {
private string $projectDir; private string $projectDir;
private bool $didReset = false;
private array $seenThisCrawl = [];
public function __construct(ParameterBagInterface $params) public function __construct(ParameterBagInterface $params)
{ {
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/'); $this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
} }
/**
* Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen.
*/
public function resetTableOnce(): void
{
if ($this->didReset) {
return;
}
$this->didReset = true;
$this->seenThisCrawl = [];
Database::getInstance()->execute('TRUNCATE tl_search_pdf');
}
/** /**
* @param array<int,array{url:string,linkText:?string}> $pdfLinks * @param array<int,array{url:string,linkText:?string}> $pdfLinks
*/ */
public function handlePdfLinks(array $pdfLinks): void public function handlePdfLinks(array $pdfLinks): void
{ {
// Dedupe nur pro Aufruf (nicht "pro Crawl")
$seen = [];
$now = time();
foreach ($pdfLinks as $row) { foreach ($pdfLinks as $row) {
$url = (string) ($row['url'] ?? ''); $url = (string) ($row['url'] ?? '');
$linkText = $row['linkText'] ?? null; $linkText = $row['linkText'] ?? null;
@@ -46,12 +32,12 @@ class PdfIndexService
continue; continue;
} }
// innerhalb eines Crawls doppelte URLs vermeiden // doppelte URLs pro Aufruf vermeiden
$seenKey = md5($url); $seenKey = md5($url);
if (isset($this->seenThisCrawl[$seenKey])) { if (isset($seen[$seenKey])) {
continue; continue;
} }
$this->seenThisCrawl[$seenKey] = true; $seen[$seenKey] = true;
$normalizedPath = $this->normalizePdfUrl($url); $normalizedPath = $this->normalizePdfUrl($url);
if ($normalizedPath === null) { if ($normalizedPath === null) {
@@ -66,24 +52,39 @@ class PdfIndexService
$mtime = (int) (filemtime($absolutePath) ?: 0); $mtime = (int) (filemtime($absolutePath) ?: 0);
$checksum = md5($normalizedPath . '|' . $mtime); $checksum = md5($normalizedPath . '|' . $mtime);
// existiert bereits?
$existing = Database::getInstance()
->prepare('SELECT checksum FROM tl_search_pdf WHERE url=? LIMIT 1')
->execute($normalizedPath)
->fetchAssoc();
$needsParse = !$existing || ($existing['checksum'] ?? '') !== $checksum;
// Titel-Priorität: // Titel-Priorität:
// 1) Linktext // 1) Linktext
// 2) PDF-Metadaten // 2) PDF-Metadaten
// 3) Dateiname // 3) Dateiname
$title = $linkText ?: basename($absolutePath);
$text = '';
if ($needsParse) {
$pdfMetaTitle = $this->readPdfMetaTitle($absolutePath); $pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
$title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath)); $title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
$text = $this->parsePdf($absolutePath); $text = $this->parsePdf($absolutePath);
if ($text === '') { if ($text === '') {
// wenn parsing fehlschlägt, NICHT überschreiben
continue; continue;
} }
}
$this->upsertPdf( $this->upsertPdf(
$normalizedPath, $normalizedPath,
$title, $title,
$text, $text, // kann '' sein → wird in SQL nicht überschrieben
$checksum, $checksum,
$mtime $mtime,
$now
); );
} }
} }
@@ -93,6 +94,10 @@ class PdfIndexService
$decoded = html_entity_decode($url); $decoded = html_entity_decode($url);
$parts = parse_url($decoded); $parts = parse_url($decoded);
if (!$parts) {
return null;
}
// 1) files/...pdf (ohne führenden Slash) // 1) files/...pdf (ohne führenden Slash)
if ( if (
!empty($parts['path']) !empty($parts['path'])
@@ -149,23 +154,29 @@ class PdfIndexService
string $title, string $title,
string $text, string $text,
string $checksum, string $checksum,
int $mtime int $mtime,
int $now
): void { ): void {
Database::getInstance() Database::getInstance()
->prepare(' ->prepare('
INSERT INTO tl_search_pdf INSERT INTO tl_search_pdf
(tstamp, url, title, text, checksum, file_mtime) (tstamp, last_seen, type, url, title, text, checksum, file_mtime)
VALUES VALUES
(?, ?, ?, ?, ?, ?) (?, ?, ?, ?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE ON DUPLICATE KEY UPDATE
tstamp = VALUES(tstamp), tstamp = VALUES(tstamp),
last_seen = VALUES(last_seen),
type = VALUES(type),
url = VALUES(url), url = VALUES(url),
title = VALUES(title), title = VALUES(title),
text=VALUES(text), checksum = VALUES(checksum),
file_mtime=VALUES(file_mtime) file_mtime = VALUES(file_mtime),
text = IF(VALUES(text) = "" OR VALUES(text) IS NULL, text, VALUES(text))
') ')
->execute( ->execute(
time(), $now,
$now,
'pdf',
$url, $url,
$title, $title,
$text, $text,
@@ -203,6 +214,7 @@ class PdfIndexService
} }
} }
} catch (\Throwable) { } catch (\Throwable) {
// ignore
} }
return null; return null;