setName('meilisearch:files:parse')
            ->setDescription('Parse indexed files via Apache Tika and enrich tl_search_files')
            ->addOption(
                'limit',
                null,
                InputOption::VALUE_OPTIONAL,
                'Maximum number of files to check per run'
            )
            ->addOption(
                'dry-run',
                null,
                InputOption::VALUE_NONE,
                'Do not send files to Tika'
            );
    }

    /**
     * Sends every changed file referenced in tl_search_files to Apache Tika and
     * stores the extracted text and title back into the same row.
     *
     * Rows are skipped (with a log entry) when the URL does not resolve to an
     * existing file below files/, when the file type is unsupported, or when the
     * stored checksum still matches and text is already present. With --dry-run
     * the files that would be parsed are only printed.
     *
     * @return int Command::FAILURE when no Tika URL is configured, otherwise Command::SUCCESS
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $this->framework->initialize();
        $this->log('Parser gestartet');

        $dryRun = (bool) $input->getOption('dry-run');
        $limitOption = $input->getOption('limit');
        $limit = $limitOption !== null ? max(1, (int) $limitOption) : null;

        $tikaUrl = rtrim((string) ($GLOBALS['TL_CONFIG']['meilisearch_tika_url'] ?? ''), '/');
        if ($tikaUrl === '') {
            $output->writeln('Tika URL not configured');

            return Command::FAILURE;
        }

        $db = Database::getInstance();

        // NOTE(review): rows are ordered by tstamp and tstamp is only bumped after a
        // successful parse, so with --limit the same oldest rows are re-checked on
        // every run if they are skipped. Confirm whether that is intended.
        $sql = "SELECT * FROM tl_search_files ORDER BY tstamp ASC";
        if ($limit !== null) {
            $sql .= " LIMIT " . (int) $limit;
        }

        $files = $db->query($sql)->fetchAllAssoc();
        if (!$files) {
            $this->log('No files to parse');

            return Command::SUCCESS;
        }

        $client = HttpClient::create([
            'timeout' => 180,
        ]);

        foreach ($files as $file) {
            $originalUrl = (string) $file['url'];

            // Normalize URL -> files/...
            $normalized = $this->resolveRelativePath($originalUrl);
            if ($normalized === null) {
                continue;
            }

            $absolutePath = TL_ROOT . '/' . $normalized;
            if (!is_file($absolutePath)) {
                $this->log('File missing, skip', [
                    'url' => $originalUrl,
                    'path' => $absolutePath,
                ]);
                continue;
            }

            $mtime = filemtime($absolutePath) ?: 0;
            $checksum = md5($normalized . '|' . $mtime);

            // Skip unchanged files that already have extracted text.
            if ($file['checksum'] === $checksum && !empty($file['text'])) {
                continue;
            }

            // The MIME check runs before the dry-run branch so that a dry run
            // only reports files that would really be sent to Tika.
            $mimeType = $this->mimeTypeFor($normalized);
            if ($mimeType === null) {
                $this->log('Unsupported file type, skip', ['url' => $normalized]);
                continue;
            }

            if ($dryRun) {
                $output->writeln('[DRY-RUN] Would parse: ' . $normalized);
                continue;
            }

            // Tika body (plain text)
            try {
                $this->log('Parsing file', ['url' => $normalized]);
                $text = trim($this->putFileToTika(
                    $client,
                    $tikaUrl . '/tika/main',
                    $absolutePath,
                    $mimeType,
                    'text/plain'
                ));
            } catch (\Throwable $e) {
                $this->log('Body parse failed', [
                    'url' => $normalized,
                    'error' => $e->getMessage(),
                ]);
                continue;
            }

            // Tika metadata (title). The title is optional, so a failure here is
            // logged but does not prevent the text from being stored.
            $title = null;
            try {
                $metaJson = $this->putFileToTika(
                    $client,
                    $tikaUrl . '/meta',
                    $absolutePath,
                    $mimeType,
                    'application/json'
                );
                $title = $this->extractTitle(json_decode($metaJson, true));
            } catch (\Throwable $e) {
                $this->log('Title lookup failed', [
                    'url' => $normalized,
                    'error' => $e->getMessage(),
                ]);
            }

            // Store result
            $db->prepare(
                "UPDATE tl_search_files SET text = ?, title = ?, checksum = ?, file_mtime = ?, tstamp = ? WHERE id = ?"
            )->execute(
                $text,
                $title,
                $checksum,
                $mtime,
                time(),
                $file['id']
            );

            $this->log('File parsed', [
                'url' => $normalized,
                'chars' => mb_strlen($text),
                'title' => $title,
            ]);
        }

        $this->log('Parser finished');

        return Command::SUCCESS;
    }

    /**
     * Turns a stored URL into a path relative to TL_ROOT that starts with "files/".
     *
     * Handles URLs that carry the real path in a "file" query parameter, strips
     * the fragment, URL-decodes the result and removes leading slashes. The
     * reason for every rejection is logged here.
     *
     * @return string|null the relative path, or null when the row must be skipped
     */
    private function resolveRelativePath(string $originalUrl): ?string
    {
        $normalized = $originalUrl;

        if (str_contains($normalized, '?')) {
            $queryString = parse_url($normalized, PHP_URL_QUERY);
            if (is_string($queryString) && $queryString !== '') {
                parse_str($queryString, $query);
                $fileParam = $query['file'] ?? null;

                // An array value (file[]=...) must not be cast to the string "Array".
                if (!is_string($fileParam) || $fileParam === '') {
                    $this->log('Not a direct file url, skip', ['url' => $originalUrl]);

                    return null;
                }

                $normalized = $fileParam;
            }
        }

        // explode() instead of strtok(): strtok() yields false for an empty
        // string and silently skips a leading "#".
        $normalized = explode('#', $normalized, 2)[0];
        $normalized = rawurldecode($normalized);
        $normalized = ltrim($normalized, '/');

        if (!str_starts_with($normalized, 'files/')) {
            $this->log('Not in files/, skip', ['url' => $originalUrl]);

            return null;
        }

        // The prefix check alone accepts "files/../..."; a NUL byte would make
        // is_file() throw and abort the whole run.
        if (str_contains($normalized, "\0") || in_array('..', explode('/', $normalized), true)) {
            $this->log('Unsafe path, skip', ['url' => $originalUrl]);

            return null;
        }

        return $normalized;
    }

    /**
     * Maps the file extension of a path to the Content-Type sent to Tika.
     *
     * @return string|null the MIME type, or null for unsupported extensions
     */
    private function mimeTypeFor(string $path): ?string
    {
        return match (strtolower(pathinfo($path, PATHINFO_EXTENSION))) {
            'pdf' => 'application/pdf',
            'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            'pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
            default => null,
        };
    }

    /**
     * Uploads a file to a Tika endpoint via PUT and returns the response body.
     *
     * The file handle is always closed again, and a non-2xx response is turned
     * into an exception so that an error page is never mistaken for content.
     *
     * @throws \RuntimeException when the file cannot be opened or Tika answers with a non-2xx status
     */
    private function putFileToTika(
        \Symfony\Contracts\HttpClient\HttpClientInterface $client,
        string $endpoint,
        string $absolutePath,
        string $mimeType,
        string $accept
    ): string {
        $handle = fopen($absolutePath, 'rb');
        if ($handle === false) {
            throw new \RuntimeException('Unable to open file for reading: ' . $absolutePath);
        }

        try {
            $response = $client->request('PUT', $endpoint, [
                'headers' => [
                    'Accept' => $accept,
                    'Content-Type' => $mimeType,
                ],
                'body' => $handle,
            ]);

            // Waiting for the status code here keeps the handle open for the upload.
            $status = $response->getStatusCode();
            if ($status < 200 || $status >= 300) {
                throw new \RuntimeException(sprintf('Tika responded with HTTP %d', $status));
            }

            return $response->getContent(false);
        } finally {
            if (is_resource($handle)) {
                fclose($handle);
            }
        }
    }

    /**
     * Picks the document title out of decoded Tika metadata.
     *
     * A metadata value may be a plain string or a list of strings; indexing a
     * plain string with [0] would return only its first character, so both
     * shapes are handled explicitly.
     *
     * @param mixed $meta decoded JSON from the /meta endpoint
     *
     * @return string|null the entity-decoded, trimmed title, or null when none is present
     */
    private function extractTitle(mixed $meta): ?string
    {
        if (!is_array($meta)) {
            return null;
        }

        foreach (['dc:title', 'pdf:docinfo:title'] as $key) {
            $value = $meta[$key] ?? null;
            if (is_array($value)) {
                $value = array_values($value)[0] ?? null;
            }

            if (is_string($value) && trim($value) !== '') {
                return trim(html_entity_decode($value, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
            }
        }

        return null;
    }

    /**
     * Writes a prefixed message, plus optional JSON-encoded context, to the PHP error log.
     *
     * @param array<string, mixed> $context
     */
    private function log(string $message, array $context = []): void
    {
        $ctx = '';
        if ($context !== []) {
            // Invalid UTF-8 (e.g. in file names) is substituted instead of
            // making json_encode() fail and drop the whole context.
            $encoded = json_encode(
                $context,
                JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE
            );
            $ctx = ' | ' . ($encoded !== false ? $encoded : '[context not encodable]');
        }

        error_log('[MeilisearchFilesParse] ' . $message . $ctx);
    }
}