setName('meilisearch:files:parse')
            ->setDescription('Parse indexed files via Apache Tika and enrich tl_search_files')
            ->addOption(
                'limit',
                null,
                InputOption::VALUE_OPTIONAL,
                'Maximum number of files to check per run'
            )
            ->addOption(
                'dry-run',
                null,
                InputOption::VALUE_NONE,
                'Do not send files to Tika'
            );
    }

    /**
     * Runs one parser pass over the rows of tl_search_files.
     *
     * For every row the stored URL is mapped to a local path below "files/".
     * Rows whose checksum (path + mtime) is unchanged and that already have
     * text are skipped. All other supported files are sent to Tika; the
     * extracted plain text, a title, the checksum and the mtime are written
     * back to the row.
     *
     * Options read from $input:
     *  - "limit":   maximum number of rows selected (minimum 1 when given)
     *  - "dry-run": only print which files would be parsed
     *
     * NOTE(review): rows are selected "ORDER BY tstamp ASC" and tstamp is only
     * touched when a row is re-parsed, so with --limit the same oldest rows are
     * selected again on the next run if they are unchanged. Confirm that this
     * is intended.
     *
     * @return int Command::FAILURE when no Tika URL is configured,
     *             Command::SUCCESS otherwise
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $this->framework->initialize();
        $this->log('Parser gestartet');

        $dryRun = (bool) $input->getOption('dry-run');
        $limitOption = $input->getOption('limit');
        $limit = $limitOption !== null ? max(1, (int) $limitOption) : null;

        $tikaUrl = rtrim((string) ($GLOBALS['TL_CONFIG']['meilisearch_tika_url'] ?? ''), '/');
        if ($tikaUrl === '') {
            $output->writeln('Tika URL not configured');

            return Command::FAILURE;
        }

        $db = Database::getInstance();

        $sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC";
        if ($limit !== null) {
            // $limit is a validated integer, so concatenation is safe here.
            $sql .= " LIMIT " . (int) $limit;
        }

        $files = $db->query($sql)->fetchAllAssoc();
        if (!$files) {
            $this->log('No files to parse');

            return Command::SUCCESS;
        }

        $client = HttpClient::create([
            'timeout' => 180,
        ]);

        // Resolved lazily on first use and then reused for every row.
        $root = null;

        foreach ($files as $file) {
            $originalUrl = (string) $file['url'];
            $existingTitle = trim((string) ($file['title'] ?? ''));

            // -------------------------------------------------
            // Normalize URL
            // -------------------------------------------------
            $normalized = $this->normalizeUrl($originalUrl);
            if ($normalized === null) {
                continue;
            }

            $root ??= defined('TL_ROOT')
                ? TL_ROOT
                : $this->framework->getContainer()->getParameter('kernel.project_dir') . '/public';

            $absolutePath = $root . '/' . $normalized;
            if (!is_file($absolutePath)) {
                $this->log('File missing, skip', [
                    'url' => $originalUrl,
                    'path' => $absolutePath,
                ]);
                continue;
            }

            $mtime = filemtime($absolutePath) ?: 0;
            $checksum = md5($normalized . '|' . $mtime);

            // -------------------------------------------------
            // Skip unchanged
            // -------------------------------------------------
            if ($file['checksum'] === $checksum && !empty($file['text'])) {
                continue;
            }

            if ($dryRun) {
                $output->writeln('[DRY-RUN] Would parse: ' . $normalized);
                continue;
            }

            // -------------------------------------------------
            // MIME type
            // -------------------------------------------------
            $ext = strtolower(pathinfo($normalized, PATHINFO_EXTENSION));
            $mimeType = match ($ext) {
                'pdf' => 'application/pdf',
                'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                default => null,
            };

            if ($mimeType === null) {
                $this->log('Unsupported file type, skip', ['url' => $normalized]);
                continue;
            }

            // -------------------------------------------------
            // Tika BODY (raw plain text)
            // -------------------------------------------------
            try {
                $this->log('Parsing file', ['url' => $normalized]);

                $text = trim($this->requestTika(
                    $client,
                    $tikaUrl . '/tika/main',
                    $absolutePath,
                    $mimeType,
                    'text/plain'
                ));
            } catch (\Throwable $e) {
                // Includes non-2xx answers: an error page must never be stored as text.
                $this->log('Body parse failed', [
                    'url' => $normalized,
                    'error' => $e->getMessage(),
                ]);
                continue;
            }

            // -------------------------------------------------
            // TITLE: keep existing editor-defined title
            // -------------------------------------------------
            $title = $existingTitle !== '' ? $existingTitle : null;

            // -------------------------------------------------
            // Tika METADATA (title) - only if no existing title
            // -------------------------------------------------
            if ($title === null) {
                try {
                    $meta = json_decode(
                        $this->requestTika(
                            $client,
                            $tikaUrl . '/meta',
                            $absolutePath,
                            $mimeType,
                            'application/json'
                        ),
                        true
                    );

                    $rawTitle = $this->extractMetaTitle($meta);
                    if ($rawTitle !== null) {
                        $title = html_entity_decode(
                            $rawTitle,
                            ENT_QUOTES | ENT_HTML5,
                            'UTF-8'
                        );
                    }
                } catch (\Throwable $e) {
                    // Metadata is optional: record the failure and fall back to the filename.
                    $this->log('Metadata lookup failed', [
                        'url' => $normalized,
                        'error' => $e->getMessage(),
                    ]);
                }
            }

            // -------------------------------------------------
            // TITLE -> ASCII safe (only if newly generated)
            // -------------------------------------------------
            if ($existingTitle === '' && $title) {
                // iconv() may return false; treat that as "no usable title".
                $ascii = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $title);
                $title = $ascii === false
                    ? ''
                    : trim((string) preg_replace('/\s+/', ' ', $ascii));
            }

            // -------------------------------------------------
            // FALLBACK: filename (title empty or shorter than 5 bytes)
            // -------------------------------------------------
            if (!$title || strlen($title) < 5) {
                $title = pathinfo($normalized, PATHINFO_FILENAME);
                $title = str_replace(['_', '-'], ' ', $title);
                $title = trim((string) preg_replace('/\s+/', ' ', $title));
            }

            // -------------------------------------------------
            // Store result
            // -------------------------------------------------
            $db->prepare(
                "UPDATE tl_search_files SET text = ?, title = ?, checksum = ?, file_mtime = ?, tstamp = ? WHERE id = ?"
            )->execute(
                $text,
                $title,
                $checksum,
                $mtime,
                time(),
                $file['id']
            );

            $this->log('File parsed', [
                'url' => $normalized,
                'chars' => mb_strlen($text),
                'title' => $title,
            ]);
        }

        $this->log('Parser finished');

        return Command::SUCCESS;
    }

    /**
     * Turns a stored URL into a relative path that starts with "files/".
     *
     * A URL with a query string is only accepted when it carries a "file"
     * parameter, whose value is then used as the path. The fragment is
     * removed, the path is URL-decoded and leading slashes are stripped.
     *
     * @return string|null the relative path, or null (after logging the reason)
     *                     when the URL does not point to a usable file
     */
    private function normalizeUrl(string $originalUrl): ?string
    {
        $normalized = $originalUrl;

        if (str_contains($normalized, '?')) {
            $parts = parse_url($normalized);

            if (is_array($parts) && !empty($parts['query'])) {
                parse_str($parts['query'], $query);
                $target = $query['file'] ?? null;

                if (!is_string($target) || $target === '') {
                    $this->log('Not a direct file url, skip', ['url' => $originalUrl]);

                    return null;
                }

                $normalized = $target;
            }
        }

        // explode() instead of strtok(): strtok() returns false for an empty token.
        $normalized = explode('#', $normalized, 2)[0];
        $normalized = ltrim(rawurldecode($normalized), '/');

        if (!str_starts_with($normalized, 'files/')) {
            $this->log('Not in files/, skip', ['url' => $originalUrl]);

            return null;
        }

        // The path comes from a URL: refuse ".." segments and NUL bytes so that
        // nothing outside files/ can be read and sent to Tika.
        $segments = preg_split('#[\\\\/]+#', $normalized) ?: [];
        if (str_contains($normalized, "\0") || in_array('..', $segments, true)) {
            $this->log('Path traversal detected, skip', ['url' => $originalUrl]);

            return null;
        }

        return $normalized;
    }

    /**
     * Sends a local file to a Tika endpoint via PUT and returns the response body.
     *
     * The file handle is always closed again, and a non-2xx status is turned
     * into an exception instead of returning the error body.
     *
     * @throws \RuntimeException when the file cannot be opened or Tika answers
     *                           with a non-2xx status
     */
    private function requestTika(
        \Symfony\Contracts\HttpClient\HttpClientInterface $client,
        string $endpoint,
        string $absolutePath,
        string $mimeType,
        string $accept
    ): string {
        $handle = fopen($absolutePath, 'rb');
        if ($handle === false) {
            throw new \RuntimeException('Cannot open file: ' . $absolutePath);
        }

        try {
            $response = $client->request('PUT', $endpoint, [
                'headers' => [
                    'Accept' => $accept,
                    'Content-Type' => $mimeType,
                ],
                'body' => $handle,
            ]);

            // Both calls block until the response is available, so the upload
            // has finished before the handle is closed below.
            $status = $response->getStatusCode();
            $content = $response->getContent(false);
        } finally {
            if (is_resource($handle)) {
                fclose($handle);
            }
        }

        if ($status < 200 || $status >= 300) {
            throw new \RuntimeException('Tika responded with HTTP ' . $status);
        }

        return $content;
    }

    /**
     * Picks a document title out of decoded Tika metadata.
     *
     * "dc:title" is preferred over "pdf:docinfo:title". Each value may be a
     * plain string or a list of strings; for a list the first entry is used.
     *
     * @param mixed $meta result of json_decode(); anything but an array yields null
     *
     * @return string|null the first non-blank title, or null when none is present
     */
    private function extractMetaTitle(mixed $meta): ?string
    {
        if (!is_array($meta)) {
            return null;
        }

        foreach (['dc:title', 'pdf:docinfo:title'] as $key) {
            $value = $meta[$key] ?? null;

            if (is_array($value)) {
                $value = $value[0] ?? null;
            }

            if (is_string($value) && trim($value) !== '') {
                return $value;
            }
        }

        return null;
    }

    /**
     * Writes a prefixed message to the PHP error log.
     *
     * A non-empty $context is appended as JSON after " | ". Invalid UTF-8 in
     * the context is substituted instead of discarding the whole context.
     */
    private function log(string $message, array $context = []): void
    {
        $ctx = '';

        if ($context) {
            $json = json_encode(
                $context,
                JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE
            );
            $ctx = ' | ' . ($json === false ? '[context not encodable]' : $json);
        }

        error_log('[MeilisearchFilesParse] ' . $message . $ctx);
    }
}