Bugfix
This commit is contained in:
@@ -88,7 +88,15 @@ class OfficeIndexService
|
|||||||
$decoded = html_entity_decode($url);
|
$decoded = html_entity_decode($url);
|
||||||
$parts = parse_url($decoded);
|
$parts = parse_url($decoded);
|
||||||
|
|
||||||
// direkter /files/-Pfad
|
// 1) files/... (ohne führenden Slash)
|
||||||
|
if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/')) {
|
||||||
|
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
|
||||||
|
if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) {
|
||||||
|
return ['/' . $parts['path'], $ext];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2) /files/...
|
||||||
if (!empty($parts['path']) && str_starts_with($parts['path'], '/files/')) {
|
if (!empty($parts['path']) && str_starts_with($parts['path'], '/files/')) {
|
||||||
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
|
$ext = strtolower(pathinfo($parts['path'], PATHINFO_EXTENSION));
|
||||||
if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) {
|
if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) {
|
||||||
@@ -96,17 +104,33 @@ class OfficeIndexService
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Contao-Download-Link mit ?p=
|
if (empty($parts['query'])) {
|
||||||
if (!empty($parts['query'])) {
|
return null;
|
||||||
parse_str($parts['query'], $query);
|
}
|
||||||
|
|
||||||
if (!empty($query['p'])) {
|
parse_str($parts['query'], $query);
|
||||||
$p = urldecode((string) $query['p']);
|
|
||||||
$ext = strtolower(pathinfo($p, PATHINFO_EXTENSION));
|
|
||||||
|
|
||||||
if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) {
|
// 3) Contao 4: ?file=files/...
|
||||||
return ['/files/' . ltrim($p, '/'), $ext];
|
if (!empty($query['file'])) {
|
||||||
}
|
$file = urldecode((string) $query['file']);
|
||||||
|
$file = ltrim($file, '/');
|
||||||
|
$ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
|
||||||
|
|
||||||
|
if (
|
||||||
|
str_starts_with($file, 'files/')
|
||||||
|
&& in_array($ext, ['docx', 'xlsx', 'pptx'], true)
|
||||||
|
) {
|
||||||
|
return ['/' . $file, $ext];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4) Contao 5: ?p=...
|
||||||
|
if (!empty($query['p'])) {
|
||||||
|
$p = urldecode((string) $query['p']);
|
||||||
|
$ext = strtolower(pathinfo($p, PATHINFO_EXTENSION));
|
||||||
|
|
||||||
|
if (in_array($ext, ['docx', 'xlsx', 'pptx'], true)) {
|
||||||
|
return ['/files/' . ltrim($p, '/'), $ext];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -16,17 +16,26 @@ class PdfIndexService
|
|||||||
public function __construct(ParameterBagInterface $params)
|
public function __construct(ParameterBagInterface $params)
|
||||||
{
|
{
|
||||||
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
|
$this->projectDir = rtrim((string) $params->get('kernel.project_dir'), '/');
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] projectDir={$this->projectDir}\n");
|
$this->debug("projectDir={$this->projectDir}");
|
||||||
|
}
|
||||||
|
|
||||||
|
private function debug(string $message): void
|
||||||
|
{
|
||||||
|
$stream = \defined('STDERR')
|
||||||
|
? STDERR
|
||||||
|
: fopen('php://stderr', 'wb');
|
||||||
|
|
||||||
|
fwrite($stream, "[Meili PDF DEBUG] {$message}\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
public function resetTableOnce(): void
|
public function resetTableOnce(): void
|
||||||
{
|
{
|
||||||
if ($this->didReset) {
|
if ($this->didReset) {
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] resetTableOnce(): already reset\n");
|
$this->debug('resetTableOnce(): already reset');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] resetTableOnce(): TRUNCATE tl_search_pdf\n");
|
$this->debug('resetTableOnce(): TRUNCATE tl_search_pdf');
|
||||||
|
|
||||||
$this->didReset = true;
|
$this->didReset = true;
|
||||||
$this->seenThisCrawl = [];
|
$this->seenThisCrawl = [];
|
||||||
@@ -34,89 +43,70 @@ class PdfIndexService
|
|||||||
try {
|
try {
|
||||||
Database::getInstance()->execute('TRUNCATE tl_search_pdf');
|
Database::getInstance()->execute('TRUNCATE tl_search_pdf');
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable $e) {
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] TRUNCATE failed: {$e->getMessage()}\n");
|
$this->debug('TRUNCATE failed: ' . $e->getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public function handlePdfLinks(array $pdfLinks): void
|
public function handlePdfLinks(array $pdfLinks): void
|
||||||
{
|
{
|
||||||
fwrite(
|
$this->debug('handlePdfLinks(): count=' . count($pdfLinks));
|
||||||
STDERR,
|
|
||||||
"[Meili PDF DEBUG] handlePdfLinks(): count=" . count($pdfLinks) . "\n"
|
|
||||||
);
|
|
||||||
|
|
||||||
foreach ($pdfLinks as $row) {
|
foreach ($pdfLinks as $row) {
|
||||||
$url = (string) ($row['url'] ?? '');
|
$url = (string) ($row['url'] ?? '');
|
||||||
$linkText = $row['linkText'] ?? null;
|
$linkText = $row['linkText'] ?? null;
|
||||||
|
|
||||||
fwrite(STDERR, "\n[Meili PDF DEBUG] URL={$url}\n");
|
$this->debug("URL={$url}");
|
||||||
|
|
||||||
if ($url === '') {
|
if ($url === '') {
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → empty URL, skip\n");
|
$this->debug('→ empty URL, skip');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$seenKey = md5($url);
|
$seenKey = md5($url);
|
||||||
if (isset($this->seenThisCrawl[$seenKey])) {
|
if (isset($this->seenThisCrawl[$seenKey])) {
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → already processed, skip\n");
|
$this->debug('→ already processed, skip');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
$this->seenThisCrawl[$seenKey] = true;
|
$this->seenThisCrawl[$seenKey] = true;
|
||||||
|
|
||||||
$normalizedPath = $this->normalizePdfUrl($url);
|
$normalizedPath = $this->normalizePdfUrl($url);
|
||||||
fwrite(
|
$this->debug('normalizePdfUrl() → ' . ($normalizedPath ?? 'NULL'));
|
||||||
STDERR,
|
|
||||||
"[Meili PDF DEBUG] normalizePdfUrl() → "
|
|
||||||
. ($normalizedPath ?? 'NULL')
|
|
||||||
. "\n"
|
|
||||||
);
|
|
||||||
|
|
||||||
if ($normalizedPath === null) {
|
if ($normalizedPath === null) {
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → normalization failed, skip\n");
|
$this->debug('→ normalization failed, skip');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$absolutePath = $this->getAbsolutePath($normalizedPath);
|
$absolutePath = $this->getAbsolutePath($normalizedPath);
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] absolutePath={$absolutePath}\n");
|
$this->debug("absolutePath={$absolutePath}");
|
||||||
|
|
||||||
if (!is_file($absolutePath)) {
|
if (!is_file($absolutePath)) {
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → file does NOT exist\n");
|
$this->debug('→ file does NOT exist');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → file exists\n");
|
$this->debug('→ file exists');
|
||||||
|
|
||||||
$mtime = (int) (filemtime($absolutePath) ?: 0);
|
$mtime = (int) (filemtime($absolutePath) ?: 0);
|
||||||
$checksum = md5($normalizedPath . '|' . $mtime);
|
$checksum = md5($normalizedPath . '|' . $mtime);
|
||||||
|
|
||||||
fwrite(
|
$this->debug("mtime={$mtime} checksum={$checksum}");
|
||||||
STDERR,
|
|
||||||
"[Meili PDF DEBUG] mtime={$mtime} checksum={$checksum}\n"
|
|
||||||
);
|
|
||||||
|
|
||||||
$pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
|
$pdfMetaTitle = $this->readPdfMetaTitle($absolutePath);
|
||||||
fwrite(
|
$this->debug('metaTitle=' . ($pdfMetaTitle ?: 'NULL'));
|
||||||
STDERR,
|
|
||||||
"[Meili PDF DEBUG] metaTitle="
|
|
||||||
. ($pdfMetaTitle ?: 'NULL')
|
|
||||||
. "\n"
|
|
||||||
);
|
|
||||||
|
|
||||||
$title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
|
$title = $linkText ?: ($pdfMetaTitle ?: basename($absolutePath));
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] final title={$title}\n");
|
$this->debug("final title={$title}");
|
||||||
|
|
||||||
$text = $this->parsePdf($absolutePath);
|
$text = $this->parsePdf($absolutePath);
|
||||||
fwrite(
|
$this->debug('parsed text length=' . strlen($text));
|
||||||
STDERR,
|
|
||||||
"[Meili PDF DEBUG] parsed text length=" . strlen($text) . "\n"
|
|
||||||
);
|
|
||||||
|
|
||||||
if ($text === '') {
|
if ($text === '') {
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → empty text, skip\n");
|
$this->debug('→ empty text, skip');
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → writing to DB\n");
|
$this->debug('→ writing to DB');
|
||||||
|
|
||||||
$this->upsertPdf(
|
$this->upsertPdf(
|
||||||
$normalizedPath,
|
$normalizedPath,
|
||||||
@@ -130,52 +120,48 @@ class PdfIndexService
|
|||||||
|
|
||||||
private function normalizePdfUrl(string $url): ?string
|
private function normalizePdfUrl(string $url): ?string
|
||||||
{
|
{
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] normalizePdfUrl(): {$url}\n");
|
$this->debug("normalizePdfUrl(): {$url}");
|
||||||
|
|
||||||
$decoded = html_entity_decode($url);
|
$decoded = html_entity_decode($url);
|
||||||
$parts = parse_url($decoded);
|
$parts = parse_url($decoded);
|
||||||
|
|
||||||
// 1) files/...pdf (ohne führenden Slash)
|
|
||||||
if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/') && str_ends_with(strtolower($parts['path']), '.pdf')) {
|
if (!empty($parts['path']) && str_starts_with($parts['path'], 'files/') && str_ends_with(strtolower($parts['path']), '.pdf')) {
|
||||||
$r = '/' . $parts['path'];
|
$r = '/' . $parts['path'];
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → relative files path {$r}\n");
|
$this->debug("→ relative files path {$r}");
|
||||||
return $r;
|
return $r;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2) /files/...pdf
|
|
||||||
if (!empty($parts['path']) && str_starts_with($parts['path'], '/files/') && str_ends_with(strtolower($parts['path']), '.pdf')) {
|
if (!empty($parts['path']) && str_starts_with($parts['path'], '/files/') && str_ends_with(strtolower($parts['path']), '.pdf')) {
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → absolute files path {$parts['path']}\n");
|
$this->debug("→ absolute files path {$parts['path']}");
|
||||||
return $parts['path'];
|
return $parts['path'];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (empty($parts['query'])) {
|
if (empty($parts['query'])) {
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → no query\n");
|
$this->debug('→ no query');
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
parse_str($parts['query'], $query);
|
parse_str($parts['query'], $query);
|
||||||
|
|
||||||
// 3) Contao 4: ?file=files/...
|
|
||||||
if (!empty($query['file'])) {
|
if (!empty($query['file'])) {
|
||||||
$file = urldecode((string) $query['file']);
|
$file = urldecode((string) $query['file']);
|
||||||
$file = ltrim($file, '/');
|
$file = ltrim($file, '/');
|
||||||
|
|
||||||
if (str_starts_with($file, 'files/') && str_ends_with(strtolower($file), '.pdf')) {
|
if (str_starts_with($file, 'files/') && str_ends_with(strtolower($file), '.pdf')) {
|
||||||
$r = '/' . $file;
|
$r = '/' . $file;
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → file= normalized {$r}\n");
|
$this->debug("→ file= normalized {$r}");
|
||||||
return $r;
|
return $r;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 4) Contao 5: ?p=...
|
|
||||||
if (!empty($query['p'])) {
|
if (!empty($query['p'])) {
|
||||||
$p = urldecode((string) $query['p']);
|
$p = urldecode((string) $query['p']);
|
||||||
$r = '/files/' . ltrim($p, '/');
|
$r = '/files/' . ltrim($p, '/');
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → p= normalized {$r}\n");
|
$this->debug("→ p= normalized {$r}");
|
||||||
return $r;
|
return $r;
|
||||||
}
|
}
|
||||||
|
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → no usable parameter\n");
|
$this->debug('→ no usable parameter');
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -214,12 +200,9 @@ class PdfIndexService
|
|||||||
$mtime
|
$mtime
|
||||||
);
|
);
|
||||||
|
|
||||||
fwrite(STDERR, "[Meili PDF DEBUG] → DB write OK\n");
|
$this->debug('→ DB write OK');
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable $e) {
|
||||||
fwrite(
|
$this->debug('DB write failed: ' . $e->getMessage());
|
||||||
STDERR,
|
|
||||||
"[Meili PDF DEBUG] DB write failed: {$e->getMessage()}\n"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -231,10 +214,7 @@ class PdfIndexService
|
|||||||
$text = $this->cleanPdfContent($pdf->getText());
|
$text = $this->cleanPdfContent($pdf->getText());
|
||||||
return mb_substr($text, 0, 20000);
|
return mb_substr($text, 0, 20000);
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable $e) {
|
||||||
fwrite(
|
$this->debug('parsePdf failed: ' . $e->getMessage());
|
||||||
STDERR,
|
|
||||||
"[Meili PDF DEBUG] parsePdf failed: {$e->getMessage()}\n"
|
|
||||||
);
|
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -252,10 +232,7 @@ class PdfIndexService
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (\Throwable $e) {
|
} catch (\Throwable $e) {
|
||||||
fwrite(
|
$this->debug('readPdfMetaTitle failed: ' . $e->getMessage());
|
||||||
STDERR,
|
|
||||||
"[Meili PDF DEBUG] readPdfMetaTitle failed: {$e->getMessage()}\n"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
|
|||||||
Reference in New Issue
Block a user