remove table reset
This commit is contained in:
@@ -30,22 +30,6 @@ class IndexPageListener
|
|||||||
'set_keys' => array_keys($set),
|
'set_keys' => array_keys($set),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
/*
|
|
||||||
* =====================
|
|
||||||
* PDF: Reset genau 1× pro Crawl
|
|
||||||
* =====================
|
|
||||||
*/
|
|
||||||
try {
|
|
||||||
$this->debug('PDF resetTableOnce(): call');
|
|
||||||
$this->pdfIndexService->resetTableOnce();
|
|
||||||
$this->debug('PDF resetTableOnce(): ok');
|
|
||||||
} catch (\Throwable $e) {
|
|
||||||
$this->debug('PDF resetTableOnce(): failed', [
|
|
||||||
'error' => $e->getMessage(),
|
|
||||||
'class' => $e::class,
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* =====================
|
* =====================
|
||||||
* SEITEN-METADATEN
|
* SEITEN-METADATEN
|
||||||
|
|||||||
@@ -8,10 +8,10 @@ $GLOBALS['TL_DCA']['tl_search_pdf'] = [
|
|||||||
'sql' => [
|
'sql' => [
|
||||||
'keys' => [
|
'keys' => [
|
||||||
'id' => 'primary',
|
'id' => 'primary',
|
||||||
'checksum' => 'unique',
|
|
||||||
'page_id' => 'index',
|
'page_id' => 'index',
|
||||||
'url' => 'index',
|
'url' => 'unique',
|
||||||
'type' => 'index',
|
'type' => 'index',
|
||||||
|
'checksum' => 'index',
|
||||||
'last_seen' => 'index', // ⬅️ NEU (für Cleanup-Performance)
|
'last_seen' => 'index', // ⬅️ NEU (für Cleanup-Performance)
|
||||||
],
|
],
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -12,9 +12,6 @@ class OfficeIndexService
|
|||||||
{
|
{
|
||||||
private string $projectDir;
|
private string $projectDir;
|
||||||
|
|
||||||
// pro Crawl-Durchlauf: doppelte Verarbeitung vermeiden
|
|
||||||
private array $seenThisCrawl = [];
|
|
||||||
|
|
||||||
public function __construct(ParameterBagInterface $params)
|
public function __construct(ParameterBagInterface $params)
|
||||||
{
|
{
|
||||||
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
|
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
|
||||||
@@ -25,6 +22,10 @@ class OfficeIndexService
|
|||||||
*/
|
*/
|
||||||
public function handleOfficeLinks(array $officeLinks): void
|
public function handleOfficeLinks(array $officeLinks): void
|
||||||
{
|
{
|
||||||
|
// Dedupe nur pro Aufruf (nicht "pro Crawl")
|
||||||
|
$seen = [];
|
||||||
|
$now = time();
|
||||||
|
|
||||||
foreach ($officeLinks as $row) {
|
foreach ($officeLinks as $row) {
|
||||||
$url = (string) ($row['url'] ?? '');
|
$url = (string) ($row['url'] ?? '');
|
||||||
$linkText = $row['linkText'] ?? null;
|
$linkText = $row['linkText'] ?? null;
|
||||||
@@ -33,13 +34,12 @@ class OfficeIndexService
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
// doppelte URLs pro Aufruf vermeiden
|
||||||
// innerhalb des Crawls gleiche URL nicht mehrfach parsen
|
|
||||||
$seenKey = md5($url);
|
$seenKey = md5($url);
|
||||||
if (isset($this->seenThisCrawl[$seenKey])) {
|
if (isset($seen[$seenKey])) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
$this->seenThisCrawl[$seenKey] = true;
|
$seen[$seenKey] = true;
|
||||||
|
|
||||||
$normalized = $this->normalizeOfficeUrl($url);
|
$normalized = $this->normalizeOfficeUrl($url);
|
||||||
if ($normalized === null) {
|
if ($normalized === null) {
|
||||||
@@ -56,27 +56,37 @@ class OfficeIndexService
|
|||||||
$mtime = (int) (filemtime($absolutePath) ?: 0);
|
$mtime = (int) (filemtime($absolutePath) ?: 0);
|
||||||
$checksum = md5($relativePath . '|' . $mtime);
|
$checksum = md5($relativePath . '|' . $mtime);
|
||||||
|
|
||||||
$title = $linkText ?: basename($absolutePath);
|
// existiert bereits?
|
||||||
|
$existing = Database::getInstance()
|
||||||
|
->prepare('SELECT checksum FROM tl_search_pdf WHERE url=? LIMIT 1')
|
||||||
|
->execute($relativePath)
|
||||||
|
->fetchAssoc();
|
||||||
|
|
||||||
|
$needsParse = !$existing || ($existing['checksum'] ?? '') !== $checksum;
|
||||||
|
|
||||||
|
// Titel-Priorität:
|
||||||
|
// 1) Linktext
|
||||||
|
// 2) Dateiname
|
||||||
|
$title = $linkText ?: basename($absolutePath);
|
||||||
|
$text = '';
|
||||||
|
|
||||||
|
if ($needsParse) {
|
||||||
$text = $this->parseOfficeFile($absolutePath, $type);
|
$text = $this->parseOfficeFile($absolutePath, $type);
|
||||||
if ($text === '') {
|
if ($text === '') {
|
||||||
|
// Parsing fehlgeschlagen → nichts überschreiben
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
$this->upsertOffice(
|
$this->upsertOffice(
|
||||||
$relativePath,
|
$relativePath,
|
||||||
$title,
|
$title,
|
||||||
$text,
|
$text, // kann '' sein → SQL überschreibt dann nicht
|
||||||
$checksum,
|
$checksum,
|
||||||
$mtime,
|
$mtime,
|
||||||
$type
|
$type,
|
||||||
|
$now
|
||||||
);
|
);
|
||||||
|
|
||||||
} catch (\Throwable $e) {
|
|
||||||
error_log(
|
|
||||||
'[ContaoMeilisearch] Office indexing failed for "' . $url . '": ' . $e->getMessage()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -88,6 +98,10 @@ class OfficeIndexService
|
|||||||
$decoded = html_entity_decode($url);
|
$decoded = html_entity_decode($url);
|
||||||
$parts = parse_url($decoded);
|
$parts = parse_url($decoded);
|
||||||
|
|
||||||
|
if (!$parts) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
// 1) files/... (ohne führenden Slash)
|
// 1) files/... (ohne führenden Slash)
|
||||||
if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/')) {
|
if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/')) {
|
||||||
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
|
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
|
||||||
@@ -117,8 +131,8 @@ class OfficeIndexService
|
|||||||
$ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
|
$ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
|
||||||
|
|
||||||
if (
|
if (
|
||||||
str_starts_with($file, 'files/')
|
str_starts_with($file, 'files/') &&
|
||||||
&& in_array($ext, ['docx', 'xlsx', 'pptx'], true)
|
in_array($ext, ['docx', 'xlsx', 'pptx'], true)
|
||||||
) {
|
) {
|
||||||
return ['/' . $file, $ext];
|
return ['/' . $file, $ext];
|
||||||
}
|
}
|
||||||
@@ -148,25 +162,28 @@ class OfficeIndexService
|
|||||||
string $text,
|
string $text,
|
||||||
string $checksum,
|
string $checksum,
|
||||||
int $mtime,
|
int $mtime,
|
||||||
string $type
|
string $type,
|
||||||
|
int $now
|
||||||
): void {
|
): void {
|
||||||
try {
|
|
||||||
Database::getInstance()
|
Database::getInstance()
|
||||||
->prepare('
|
->prepare('
|
||||||
INSERT INTO tl_search_pdf
|
INSERT INTO tl_search_pdf
|
||||||
(tstamp, type, url, title, text, checksum, file_mtime)
|
(tstamp, last_seen, type, url, title, text, checksum, file_mtime)
|
||||||
VALUES
|
VALUES
|
||||||
(?, ?, ?, ?, ?, ?, ?)
|
(?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
ON DUPLICATE KEY UPDATE
|
ON DUPLICATE KEY UPDATE
|
||||||
tstamp = VALUES(tstamp),
|
tstamp = VALUES(tstamp),
|
||||||
|
last_seen = VALUES(last_seen),
|
||||||
type = VALUES(type),
|
type = VALUES(type),
|
||||||
url = VALUES(url),
|
url = VALUES(url),
|
||||||
title = VALUES(title),
|
title = VALUES(title),
|
||||||
text=VALUES(text),
|
checksum = VALUES(checksum),
|
||||||
file_mtime=VALUES(file_mtime)
|
file_mtime = VALUES(file_mtime),
|
||||||
|
text = IF(VALUES(text) = "" OR VALUES(text) IS NULL, text, VALUES(text))
|
||||||
')
|
')
|
||||||
->execute(
|
->execute(
|
||||||
time(),
|
$now,
|
||||||
|
$now,
|
||||||
$type,
|
$type,
|
||||||
$url,
|
$url,
|
||||||
$title,
|
$title,
|
||||||
@@ -174,11 +191,6 @@ class OfficeIndexService
|
|||||||
$checksum,
|
$checksum,
|
||||||
$mtime
|
$mtime
|
||||||
);
|
);
|
||||||
} catch (\Throwable $e) {
|
|
||||||
error_log(
|
|
||||||
'[ContaoMeilisearch] Failed to write Office index entry (' . $url . '): ' . $e->getMessage()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private function parseOfficeFile(string $absolutePath, string $type): string
|
private function parseOfficeFile(string $absolutePath, string $type): string
|
||||||
@@ -206,10 +218,7 @@ class OfficeIndexService
|
|||||||
}
|
}
|
||||||
|
|
||||||
return $this->cleanText($text);
|
return $this->cleanText($text);
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable) {
|
||||||
error_log(
|
|
||||||
'[ContaoMeilisearch] Failed to parse DOCX "' . $absolutePath . '": ' . $e->getMessage()
|
|
||||||
);
|
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -227,10 +236,7 @@ class OfficeIndexService
|
|||||||
}
|
}
|
||||||
|
|
||||||
return $this->cleanText($text);
|
return $this->cleanText($text);
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable) {
|
||||||
error_log(
|
|
||||||
'[ContaoMeilisearch] Failed to parse XLSX "' . $absolutePath . '": ' . $e->getMessage()
|
|
||||||
);
|
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -250,10 +256,7 @@ class OfficeIndexService
|
|||||||
}
|
}
|
||||||
|
|
||||||
return $this->cleanText($text);
|
return $this->cleanText($text);
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable) {
|
||||||
error_log(
|
|
||||||
'[ContaoMeilisearch] Failed to parse PPTX "' . $absolutePath . '": ' . $e->getMessage()
|
|
||||||
);
|
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,34 +10,20 @@ class PdfIndexService
|
|||||||
{
|
{
|
||||||
private string $projectDir;
|
private string $projectDir;
|
||||||
|
|
||||||
private bool $didReset = false;
|
|
||||||
private array $seenThisCrawl = [];
|
|
||||||
|
|
||||||
public function __construct(ParameterBagInterface $params)
|
public function __construct(ParameterBagInterface $params)
|
||||||
{
|
{
|
||||||
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
|
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Wird aus dem Listener beim ersten Hook-Call pro Crawl aufgerufen.
|
|
||||||
*/
|
|
||||||
public function resetTableOnce(): void
|
|
||||||
{
|
|
||||||
if ($this->didReset) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
$this->didReset = true;
|
|
||||||
$this->seenThisCrawl = [];
|
|
||||||
|
|
||||||
Database::getInstance()->execute('TRUNCATE tl_search_pdf');
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param array<int,array{url:string,linkText:?string}> $pdfLinks
|
* @param array<int,array{url:string,linkText:?string}> $pdfLinks
|
||||||
*/
|
*/
|
||||||
public function handlePdfLinks(array $pdfLinks): void
|
public function handlePdfLinks(array $pdfLinks): void
|
||||||
{
|
{
|
||||||
|
// Dedupe nur pro Aufruf (nicht "pro Crawl")
|
||||||
|
$seen = [];
|
||||||
|
$now = time();
|
||||||
|
|
||||||
foreach ($pdfLinks as $row) {
|
foreach ($pdfLinks as $row) {
|
||||||
$url = (string) ($row['url'] ?? '');
|
$url = (string) ($row['url'] ?? '');
|
||||||
$linkText = $row['linkText'] ?? null;
|
$linkText = $row['linkText'] ?? null;
|
||||||
@@ -46,12 +32,12 @@ class PdfIndexService
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// innerhalb eines Crawls doppelte URLs vermeiden
|
// doppelte URLs pro Aufruf vermeiden
|
||||||
$seenKey = md5($url);
|
$seenKey = md5($url);
|
||||||
if (isset($this->seenThisCrawl[$seenKey])) {
|
if (isset($seen[$seenKey])) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
$this->seenThisCrawl[$seenKey] = true;
|
$seen[$seenKey] = true;
|
||||||
|
|
||||||
$normalizedPath = $this->normalizePdfUrl($url);
|
$normalizedPath = $this->normalizePdfUrl($url);
|
||||||
if ($normalizedPath === null) {
|
if ($normalizedPath === null) {
|
||||||
@@ -66,24 +52,39 @@ class PdfIndexService
|
|||||||
$mtime = (int) (filemtime($absolutePath) ?: 0);
|
$mtime = (int) (filemtime($absolutePath) ?: 0);
|
||||||
$checksum = md5($normalizedPath . '|' . $mtime);
|
$checksum = md5($normalizedPath . '|' . $mtime);
|
||||||
|
|
||||||
|
// existiert bereits?
|
||||||
|
$existing = Database::getInstance()
|
||||||
|
->prepare('SELECT checksum FROM tl_search_pdf WHERE url=? LIMIT 1')
|
||||||
|
->execute($normalizedPath)
|
||||||
|
->fetchAssoc();
|
||||||
|
|
||||||
|
$needsParse = !$existing || ($existing['checksum'] ?? '') !== $checksum;
|
||||||
|
|
||||||
// Titel-Priorität:
|
// Titel-Priorität:
|
||||||
// 1) Linktext
|
// 1) Linktext
|
||||||
// 2) PDF-Metadaten
|
// 2) PDF-Metadaten
|
||||||
// 3) Dateiname
|
// 3) Dateiname
|
||||||
|
$title = $linkText ?: basename($absolutePath);
|
||||||
|
$text = '';
|
||||||
|
|
||||||
|
if ($needsParse) {
|
||||||
$pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
|
$pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
|
||||||
$title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
|
$title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
|
||||||
|
|
||||||
$text = $this->parsePdf($absolutePath);
|
$text = $this->parsePdf($absolutePath);
|
||||||
if ($text === '') {
|
if ($text === '') {
|
||||||
|
// wenn parsing fehlschlägt, NICHT überschreiben
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
$this->upsertPdf(
|
$this->upsertPdf(
|
||||||
$normalizedPath,
|
$normalizedPath,
|
||||||
$title,
|
$title,
|
||||||
$text,
|
$text, // kann '' sein → wird in SQL nicht überschrieben
|
||||||
$checksum,
|
$checksum,
|
||||||
$mtime
|
$mtime,
|
||||||
|
$now
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -93,6 +94,10 @@ class PdfIndexService
|
|||||||
$decoded = html_entity_decode($url);
|
$decoded = html_entity_decode($url);
|
||||||
$parts = parse_url($decoded);
|
$parts = parse_url($decoded);
|
||||||
|
|
||||||
|
if (!$parts) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
// 1) files/...pdf (ohne führenden Slash)
|
// 1) files/...pdf (ohne führenden Slash)
|
||||||
if (
|
if (
|
||||||
!empty($parts['path'])
|
!empty($parts['path'])
|
||||||
@@ -149,23 +154,29 @@ class PdfIndexService
|
|||||||
string $title,
|
string $title,
|
||||||
string $text,
|
string $text,
|
||||||
string $checksum,
|
string $checksum,
|
||||||
int $mtime
|
int $mtime,
|
||||||
|
int $now
|
||||||
): void {
|
): void {
|
||||||
Database::getInstance()
|
Database::getInstance()
|
||||||
->prepare('
|
->prepare('
|
||||||
INSERT INTO tl_search_pdf
|
INSERT INTO tl_search_pdf
|
||||||
(tstamp, url, title, text, checksum, file_mtime)
|
(tstamp, last_seen, type, url, title, text, checksum, file_mtime)
|
||||||
VALUES
|
VALUES
|
||||||
(?, ?, ?, ?, ?, ?)
|
(?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
ON DUPLICATE KEY UPDATE
|
ON DUPLICATE KEY UPDATE
|
||||||
tstamp = VALUES(tstamp),
|
tstamp = VALUES(tstamp),
|
||||||
|
last_seen = VALUES(last_seen),
|
||||||
|
type = VALUES(type),
|
||||||
url = VALUES(url),
|
url = VALUES(url),
|
||||||
title = VALUES(title),
|
title = VALUES(title),
|
||||||
text=VALUES(text),
|
checksum = VALUES(checksum),
|
||||||
file_mtime=VALUES(file_mtime)
|
file_mtime = VALUES(file_mtime),
|
||||||
|
text = IF(VALUES(text) = "" OR VALUES(text) IS NULL, text, VALUES(text))
|
||||||
')
|
')
|
||||||
->execute(
|
->execute(
|
||||||
time(),
|
$now,
|
||||||
|
$now,
|
||||||
|
'pdf',
|
||||||
$url,
|
$url,
|
||||||
$title,
|
$title,
|
||||||
$text,
|
$text,
|
||||||
@@ -203,6 +214,7 @@ class PdfIndexService
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (\Throwable) {
|
} catch (\Throwable) {
|
||||||
|
// ignore
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
|
|||||||
Reference in New Issue
Block a user