fix: detect and fallback garbled PDF text to OCR (#13366) (#13404)

## Problem

When PDF fonts lack ToUnicode/CMap mappings, pdfplumber (pdfminer) cannot map CIDs to the correct Unicode characters and instead outputs PUA characters (U+E000–U+F8FF) or `(cid:xxx)` placeholders. The original code fully trusted the pdfplumber text without any garbled-text detection, so garbled output ended up in the final parsed result.

Relates to #13366

## Solution

### 1. Garbled text detection functions

- `_is_garbled_char(ch)`: Detects PUA characters (BMP/Plane 15/16), the replacement character U+FFFD, control characters, and unassigned/surrogate codepoints
- `_is_garbled_text(text, threshold)`: Calculates the garbled-character ratio and detects `(cid:xxx)` patterns
- `_is_garbled_by_font_encoding(page_chars, min_chars)`: Detects text that comes largely from subset fonts yet contains almost no CJK characters and mostly ASCII punctuation (CJK glyphs mapped to ASCII codepoints); used as a second strategy at both box and page level

### 2. Box-level fallback (in `__ocr()`)

When a text box has ≥50% garbled characters, discard the pdfplumber text and fall back to OCR recognition.

### 3. Page-level detection (in `__images__()`)

Sample characters from each page; if the garbled rate is ≥30%, clear all pdfplumber characters for that page, forcing full OCR.

### 4. Layout recognizer CID filtering

Filter out `(cid:xxx)` patterns in the `layout_recognizer.py` text processing to prevent them from polluting layout analysis.

## Testing

- 29 unit tests covering: normal CJK/English text, PUA characters, CID patterns, mixed text, boundary thresholds, edge cases
- All 85 existing project unit tests pass without regression
2026-05-03 16:57:48 +08:00 · 2026-03-10 11:20:31 +08:00
parent 7f6a9e8ee9
commit 292a1a8566
3 changed files with 619 additions and 6 deletions
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@ -22,6 +22,7 @@ import random
 import re
 import sys
 import threading
+import unicodedata
 from collections import Counter, defaultdict
 from copy import deepcopy
 from io import BytesIO
@ -197,6 +198,127 @@ class RAGFlowPdfParser:
                    return False
        return True

+    # Matches the "(cid:NNN)" placeholder text that pdfminer produces for a
+    # glyph it could not map to Unicode. Optional whitespace is tolerated
+    # around the colon and after the digits; the match is case-sensitive.
+    _CID_PATTERN = re.compile(r"\(cid\s*:\s*\d+\s*\)")
+
+    @staticmethod
+    def _is_garbled_char(ch):
+        """Check if a single character is garbled (unmappable from PDF font encoding).
+
+        A character is considered garbled if it falls into Unicode Private Use Areas
+        or certain replacement/control character ranges that typically indicate
+        pdfminer failed to map a CID to a valid Unicode codepoint.
+
+        Args:
+            ch: A single-character string. A falsy value (empty string, None)
+                yields False; a longer string makes ``ord`` raise ``TypeError``.
+
+        Returns:
+            True if the code point is treated as garbled, otherwise False.
+        """
+        if not ch:
+            return False
+        cp = ord(ch)
+        # Private Use Area inside the Basic Multilingual Plane.
+        if 0xE000 <= cp <= 0xF8FF:
+            return True
+        # Plane 15 (Supplementary Private Use Area-A).
+        if 0xF0000 <= cp <= 0xFFFFF:
+            return True
+        # Plane 16 (Supplementary Private Use Area-B).
+        if 0x100000 <= cp <= 0x10FFFF:
+            return True
+        # U+FFFD REPLACEMENT CHARACTER.
+        if cp == 0xFFFD:
+            return True
+        # C0 control characters, except the common whitespace controls.
+        if cp < 0x20 and ch not in ('\t', '\n', '\r'):
+            return True
+        # C1 control characters.
+        if 0x80 <= cp <= 0x9F:
+            return True
+        # Anything else that Unicode classifies as unassigned (Cn) or
+        # surrogate (Cs).
+        cat = unicodedata.category(ch)
+        if cat in ("Cn", "Cs"):
+            return True
+        return False
+
+    @staticmethod
+    def _is_garbled_text(text, threshold=0.5):
+        """Check if a text string contains too many garbled characters.
+
+        Examines each non-whitespace character and determines whether the
+        proportion of garbled characters reaches the given threshold. Also
+        detects pdfminer's CID placeholder patterns like '(cid:123)'.
+
+        Note that a single CID placeholder anywhere in ``text`` makes the
+        result True immediately; ``threshold`` is not consulted in that case.
+
+        Args:
+            text: The string to inspect. Falsy or whitespace-only input
+                yields False.
+            threshold: Minimum garbled/non-whitespace ratio (inclusive) at
+                which the text is reported as garbled.
+
+        Returns:
+            True if the text is considered garbled, otherwise False.
+        """
+        if not text or not text.strip():
+            return False
+        # Any CID placeholder short-circuits the ratio computation.
+        if RAGFlowPdfParser._CID_PATTERN.search(text):
+            return True
+        garbled_count = 0
+        total = 0
+        for ch in text:
+            # Whitespace is excluded from both numerator and denominator.
+            if ch.isspace():
+                continue
+            total += 1
+            if RAGFlowPdfParser._is_garbled_char(ch):
+                garbled_count += 1
+        if total == 0:
+            return False
+        return garbled_count / total >= threshold
+
+    @staticmethod
+    def _has_subset_font_prefix(fontname):
+        """Check if a font name has a subset prefix (e.g. 'DY1+ZLQDm1-1').
+
+        PDF subset fonts use a 6-letter uppercase tag followed by '+' before
+        the actual font name. Some tools use shorter tags (e.g. 'DY1+'), so
+        this check accepts any run of 2 to 6 uppercase ASCII letters or
+        digits at the start of the name that is immediately followed by '+'.
+
+        Args:
+            fontname: The font name to inspect. A falsy value yields False.
+
+        Returns:
+            True if the name starts with such a tag, otherwise False.
+        """
+        if not fontname:
+            return False
+        return bool(re.match(r"^[A-Z0-9]{2,6}\+", fontname))
+
+    @staticmethod
+    def _is_garbled_by_font_encoding(page_chars, min_chars=20):
+        """Detect garbled text caused by broken font encoding mappings.
+
+        Some PDFs (especially older Chinese standards) embed custom fonts that
+        map CJK glyphs to ASCII codepoints. The extracted text appears as
+        random ASCII punctuation/symbols instead of actual CJK characters.
+
+        Detection strategy: if a significant proportion of entries come from
+        subset-embedded fonts and the input is dominated by ASCII punctuation
+        and symbols (ASCII digits and letters are not counted) with virtually
+        no CJK/Hangul/Kana characters, the text is likely garbled due to
+        broken font encoding.
+
+        Concretely, True is returned only when all of the following hold:
+        at least ``min_chars`` non-whitespace entries, at least 30% of them
+        carrying a subset-font prefix, fewer than 5% CJK-like, and more than
+        40% ASCII punctuation/symbols.
+
+        Args:
+            page_chars: Sequence of dict-like entries read via
+                ``.get("text")`` and ``.get("fontname")`` (presumably
+                pdfplumber char dicts; verify against callers). Only the
+                first code point of each entry's text is classified.
+            min_chars: Minimum number of non-whitespace entries required
+                before any verdict is made.
+
+        Returns:
+            True if the entries look like font-encoding garbling, else False.
+        """
+        # Cheap pre-check; repeated below once whitespace entries are excluded.
+        if not page_chars or len(page_chars) < min_chars:
+            return False
+
+        subset_font_count = 0
+        total_non_space = 0
+        ascii_punct_sym = 0
+        cjk_like = 0
+
+        for c in page_chars:
+            text = c.get("text", "")
+            fontname = c.get("fontname", "")
+            if not text or text.isspace():
+                continue
+            total_non_space += 1
+
+            if RAGFlowPdfParser._has_subset_font_prefix(fontname):
+                subset_font_count += 1
+
+            # Classify by the first code point only.
+            cp = ord(text[0])
+            # CJK-like: U+2E80..U+9FFF (CJK radicals through unified
+            # ideographs, which already contains the kana range
+            # U+3040..U+30FF listed last), U+F900..U+FAFF compatibility
+            # ideographs, U+20000..U+2FA1F supplementary ideographs, and
+            # U+AC00..U+D7AF Hangul syllables.
+            if (0x2E80 <= cp <= 0x9FFF or 0xF900 <= cp <= 0xFAFF
+                    or 0x20000 <= cp <= 0x2FA1F
+                    or 0xAC00 <= cp <= 0xD7AF
+                    or 0x3040 <= cp <= 0x30FF):
+                cjk_like += 1
+            # Printable ASCII that is neither a digit nor a letter.
+            elif (0x21 <= cp <= 0x2F or 0x3A <= cp <= 0x40
+                    or 0x5B <= cp <= 0x60 or 0x7B <= cp <= 0x7E):
+                ascii_punct_sym += 1
+
+        if total_non_space < min_chars:
+            return False
+
+        # Without a meaningful share of subset fonts the heuristic does not apply.
+        subset_ratio = subset_font_count / total_non_space
+        if subset_ratio < 0.3:
+            return False
+
+        cjk_ratio = cjk_like / total_non_space
+        punct_ratio = ascii_punct_sym / total_non_space
+        if cjk_ratio < 0.05 and punct_ratio > 0.4:
+            return True
+
+        return False
+
    def _evaluate_table_orientation(self, table_img, sample_ratio=0.3):
        """
        Evaluate the best rotation orientation for a table image.
@ -618,14 +740,40 @@ class RAGFlowPdfParser:
            if not b["chars"]:
                del b["chars"]
                continue
-            m_ht = np.mean([c["height"] for c in b["chars"]])
-            for c in Recognizer.sort_Y_firstly(b["chars"], m_ht):
+            box_chars = b["chars"]
+            m_ht = np.mean([c["height"] for c in box_chars])
+            garbled_count = 0
+            total_count = 0
+            for c in Recognizer.sort_Y_firstly(box_chars, m_ht):
                if c["text"] == " " and b["text"]:
                    if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]):
                        b["text"] += " "
                else:
                    b["text"] += c["text"]
+                    for ch in c["text"]:
+                        if not ch.isspace():
+                            total_count += 1
+                            if self._is_garbled_char(ch):
+                                garbled_count += 1
            del b["chars"]
+            # If the majority of characters from pdfplumber are garbled,
+            # clear the text so OCR recognition will be used as fallback.
+            # Strategy 1: PUA / unmapped CID characters
+            if total_count > 0 and garbled_count / total_count >= 0.5:
+                logging.info(
+                    "Page %d: detected garbled pdfplumber text (garbled=%d/%d), falling back to OCR for box at (%.1f, %.1f)",
+                    pagenum, garbled_count, total_count, b["x0"], b["top"],
+                )
+                b["text"] = ""
+                continue
+            # Strategy 2: font-encoding garbling — all chars are ASCII
+            # punctuation from subset fonts (no CJK output)
+            if total_count > 0 and self._is_garbled_by_font_encoding(box_chars, min_chars=5):
+                logging.info(
+                    "Page %d: detected font-encoding garbled text (%d chars), falling back to OCR for box at (%.1f, %.1f)",
+                    pagenum, total_count, b["x0"], b["top"],
+                )
+                b["text"] = ""

        logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")
        start = timer()
@ -1400,6 +1548,34 @@ class RAGFlowPdfParser:
                        logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
                        self.page_chars = [[] for _ in range(page_to - page_from)]  # If failed to extract, using empty list instead.

+                    # Detect garbled pages and clear their chars so the OCR
+                    # path will be used instead. Two detection strategies:
+                    # 1) PUA / unmapped CID characters (threshold=0.3)
+                    # 2) Font-encoding garbling: subset fonts mapping CJK to ASCII
+                    for pi, page_ch in enumerate(self.page_chars):
+                        if not page_ch:
+                            continue
+                        # Strategy 1: PUA / CID garbling
+                        sample = page_ch if len(page_ch) <= 200 else page_ch[:200]
+                        sample_text = "".join(c.get("text", "") for c in sample)
+                        if self._is_garbled_text(sample_text, threshold=0.3):
+                            logging.warning(
+                                "Page %d: pdfplumber extracted mostly garbled characters (%d chars), "
+                                "clearing to use OCR fallback.",
+                                page_from + pi + 1, len(page_ch),
+                            )
+                            self.page_chars[pi] = []
+                            continue
+                        # Strategy 2: font-encoding garbling (CJK mapped to ASCII)
+                        if self._is_garbled_by_font_encoding(page_ch):
+                            logging.warning(
+                                "Page %d: detected font-encoding garbled text "
+                                "(subset fonts with no CJK output, %d chars), "
+                                "clearing to use OCR fallback.",
+                                page_from + pi + 1, len(page_ch),
+                            )
+                            self.page_chars[pi] = []
+
                    self.total_page = len(self.pdf.pages)

        except Exception as e: