From 08c1f8a01591ed2c37f442679d0931ddab2b9c90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=BCrgen=20Mummert?=
Date: Thu, 25 Dec 2025 14:53:27 +0100
Subject: [PATCH] Bugfix

---
 src/Service/PdfIndexService.php | 44 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/src/Service/PdfIndexService.php b/src/Service/PdfIndexService.php
index 61807ca..19d12ff 100644
--- a/src/Service/PdfIndexService.php
+++ b/src/Service/PdfIndexService.php
@@ -162,11 +162,47 @@ class PdfIndexService
         }
     }
 
+    /**
+     * Normalises text extracted from a PDF into one trimmed, single-spaced line.
+     *
+     * Each step degrades gracefully: when an optional extension is missing or a
+     * regex fails, the text from the previous step is kept instead of being lost.
+     *
+     * @param string $text Raw extracted text; may contain malformed UTF-8.
+     *
+     * @return string The cleaned text.
+     */
-    private function cleanPdfContent(string $content): string
+    private function cleanPdfContent(string $text): string
     {
-        $content = preg_replace('/[\x00-\x1F\x7F]/u', ' ', $content);
-        $content = preg_replace('/\s+/u', ' ', $content);
+        // 1. Repair malformed byte sequences first: every /u regex below returns
+        //    null on invalid UTF-8, which would otherwise discard the whole text.
+        if (function_exists('mb_scrub')) {
+            $text = mb_scrub($text, 'UTF-8');
+        }
+
+        // 2. Normalise to NFC so decomposed characters become single code points.
+        //    normalize() returns false on failure; keep the input in that case.
+        if (class_exists(\Normalizer::class)) {
+            $normalized = \Normalizer::normalize($text, \Normalizer::FORM_C);
+            if (is_string($normalized)) {
+                $text = $normalized;
+            }
+        }
 
-        return trim($content);
+        // 3. Replace everything that is not a letter, combining mark, number,
+        //    punctuation, separator or newline (e.g. music glyphs) with a space.
+        //    \p{M} is kept so accents survive when step 2 could not compose them.
+        $text = preg_replace('/[^\p{L}\p{M}\p{N}\p{P}\p{Z}\n]/u', ' ', $text) ?? $text;
+
+        // 4. Map backslash-escaped and typographic apostrophes to a plain "'".
+        $text = str_replace(["\\'", "’", "‘"], "'", $text);
+
+        // 5. Collapse runs of the same punctuation mark ("!!!" becomes "!").
+        $text = preg_replace('/([.,;:!?])\1+/', '$1', $text) ?? $text;
+
+        // 6. Collapse all whitespace, including newlines, into single spaces.
+        $text = preg_replace('/\s+/u', ' ', $text) ?? $text;
+
+        return trim($text);
     }
 }
\ No newline at end of file