diff --git a/src/Command/MeilisearchFilesParseCommand.php b/src/Command/MeilisearchFilesParseCommand.php
new file mode 100644
index 0000000..2b866df
--- /dev/null
+++ b/src/Command/MeilisearchFilesParseCommand.php
@@ -0,0 +1,150 @@
+setName('meilisearch:files:parse')
+            ->setDescription('Parse indexed files via Apache Tika and store extracted text')
+            ->addOption(
+                'limit',
+                null,
+                InputOption::VALUE_OPTIONAL,
+                'Maximum number of files to parse per run',
+                20
+            )
+            ->addOption(
+                'dry-run',
+                null,
+                InputOption::VALUE_NONE,
+                'Do not send files to Tika, just show what would be parsed'
+            );
+    }
+
+    /**
+     * Sends rows of tl_search_files to Apache Tika and stores the extracted text.
+     *
+     * Up to --limit rows (at least 1) are read, ordered by ascending tstamp. A row
+     * is skipped when its file does not exist below TL_ROOT, or when its stored
+     * checksum equals md5(url|mtime) and its stored text is non-empty. With
+     * --dry-run the remaining candidates are only printed.
+     *
+     * A failure for one file is logged and does not abort the run.
+     *
+     * @return int Command::FAILURE when no Tika URL is configured, Command::SUCCESS otherwise
+     */
+    protected function execute(InputInterface $input, OutputInterface $output): int
+    {
+        $this->framework->initialize();
+
+        $this->log('Parser gestartet');
+
+        $limit = max(1, (int) $input->getOption('limit'));
+        $dryRun = (bool) $input->getOption('dry-run');
+
+        // Fall back to '' so that a missing config key reaches the "not configured"
+        // branch below instead of raising an undefined-index warning.
+        $tikaUrl = rtrim((string) ($GLOBALS['TL_CONFIG']['meilisearch_tika_url'] ?? ''), '/');
+        if ($tikaUrl === '') {
+            $output->writeln('Tika URL not configured');
+            return Command::FAILURE;
+        }
+
+        $db = Database::getInstance();
+
+        // NOTE(review): rows skipped further down (missing or unchanged file) keep
+        // their tstamp in this method, so they presumably come first again on the
+        // next run and may crowd out newer rows - confirm this is intended.
+        $files = $db
+            ->prepare(
+                "SELECT * FROM tl_search_files
+                 ORDER BY tstamp ASC
+                 LIMIT ?"
+            )
+            ->execute($limit)
+            ->fetchAllAssoc();
+
+        if (!$files) {
+            $this->log('No files to parse');
+            return Command::SUCCESS;
+        }
+
+        $client = HttpClient::create([
+            'timeout' => 120,
+        ]);
+
+        foreach ($files as $file) {
+            $absolutePath = TL_ROOT . '/' . ltrim($file['url'], '/');
+            if (!is_file($absolutePath)) {
+                $this->log('File missing, skip', ['url' => $file['url']]);
+                continue;
+            }
+
+            // The checksum covers only URL and modification time, not the file content.
+            $mtime = filemtime($absolutePath) ?: 0;
+            $checksum = md5($file['url'] . '|' . $mtime);
+
+            // NOTE(review): a row whose stored text is empty is sent to Tika again
+            // on every run, even when the checksum matches - confirm this is wanted.
+            if ($file['checksum'] === $checksum && !empty($file['text'])) {
+                $this->log('Skip unchanged file', ['url' => $file['url']]);
+                continue;
+            }
+
+            if ($dryRun) {
+                $output->writeln('[DRY-RUN] Would parse: ' . $file['url']);
+                continue;
+            }
+
+            // Open the file explicitly so that a failure is reported for this file
+            // and the handle can be released once the request is done.
+            $handle = fopen($absolutePath, 'rb');
+            if ($handle === false) {
+                $this->log('Parse failed', [
+                    'url' => $file['url'],
+                    'error' => 'Unable to open file for reading',
+                ]);
+                continue;
+            }
+
+            try {
+                $this->log('Parsing file', ['url' => $file['url']]);
+
+                $response = $client->request(
+                    'PUT',
+                    $tikaUrl . '/tika',
+                    [
+                        'headers' => [
+                            'Accept' => 'text/plain',
+                        ],
+                        'body' => $handle,
+                    ]
+                );
+
+                // getContent(false) does not throw on error statuses, so reject them
+                // here; otherwise an error page would be stored as the extracted text
+                // and the fresh checksum would prevent any retry.
+                $status = $response->getStatusCode();
+                if ($status < 200 || $status >= 300) {
+                    throw new \RuntimeException('Tika responded with HTTP status ' . $status);
+                }
+
+                $text = trim((string) $response->getContent(false));
+
+                $db->prepare(
+                    "UPDATE tl_search_files
+                     SET text = ?, checksum = ?, file_mtime = ?, tstamp = ?
+                     WHERE id = ?"
+                )->execute(
+                    $text,
+                    $checksum,
+                    $mtime,
+                    time(),
+                    $file['id']
+                );
+
+                $this->log('File parsed', [
+                    'url' => $file['url'],
+                    // strlen() counts bytes, not characters.
+                    'chars' => strlen($text),
+                ]);
+            } catch (\Throwable $e) {
+                // Best effort: one broken file must not stop the remaining ones.
+                $this->log('Parse failed', [
+                    'url' => $file['url'],
+                    'error' => $e->getMessage(),
+                ]);
+            } finally {
+                if (is_resource($handle)) {
+                    fclose($handle);
+                }
+            }
+        }
+
+        $this->log('Parser finished');
+        return Command::SUCCESS;
+    }
+
+    /**
+     * Writes a prefixed message to the PHP error log.
+     *
+     * @param string $message Text placed after the "[MeilisearchFilesParse] " prefix
+     * @param array  $context Optional data appended as " | <json>"; omitted when empty
+     */
+    private function log(string $message, array $context = []): void
+    {
+        $ctx = '';
+
+        if ($context) {
+            // Partial output keeps the entry useful when a value cannot be encoded
+            // (for example invalid UTF-8 inside an exception message).
+            $encoded = json_encode(
+                $context,
+                JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_PARTIAL_OUTPUT_ON_ERROR
+            );
+
+            if ($encoded !== false) {
+                $ctx = ' | ' . $encoded;
+            }
+        }
+
+        error_log('[MeilisearchFilesParse] ' . $message . $ctx);
+    }
+}
\ No newline at end of file