fix: detect garbled PDF text and fall back to OCR (#13366) (#13404)

## Problem

When PDF fonts lack ToUnicode/CMap mappings, pdfplumber (pdfminer)
cannot map CIDs to the correct Unicode characters and instead outputs PUA
characters (U+E000–U+F8FF) or `(cid:xxx)` placeholders. The original code
fully trusted pdfplumber's text without any garbled-text detection, so the
garbled output ended up in the final parsed result.

Relates to #13366

## Solution

### 1. Garbled text detection functions
- `_is_garbled_char(ch)`: Detects PUA characters (BMP/Plane 15/16),
replacement character U+FFFD, control characters, and
unassigned/surrogate codepoints
- `_is_garbled_text(text, threshold)`: Calculates garbled ratio and
detects `(cid:xxx)` patterns

### 2. Box-level fallback (in `__ocr()`)
When a text box has ≥50% garbled characters, discard the pdfplumber text and
fall back to OCR recognition.

### 3. Page-level detection (in `__images__()`)
Sample up to the first 200 characters of each page; if the garbled rate is
≥30%, clear all pdfplumber characters for that page, forcing full OCR.

### 4. Layout recognizer CID filtering
Filter out `(cid:xxx)` patterns in `layout_recognizer.py` text
processing to prevent them from polluting layout analysis.

## Testing
- 29 unit tests covering: normal CJK/English text, PUA characters, CID
patterns, mixed text, boundary thresholds, edge cases
- All 85 existing project unit tests pass without regression
This commit is contained in:
tunsuy
2026-03-10 11:20:31 +08:00
committed by GitHub
parent 7f6a9e8ee9
commit 292a1a8566
3 changed files with 619 additions and 6 deletions

View File

@ -22,6 +22,7 @@ import random
import re
import sys
import threading
import unicodedata
from collections import Counter, defaultdict
from copy import deepcopy
from io import BytesIO
@ -197,6 +198,127 @@ class RAGFlowPdfParser:
return False
return True
# Matches pdfminer's placeholder for a font character it could not map to
# Unicode, e.g. "(cid:123)". Whitespace is tolerated before and after the
# colon and after the digits.
_CID_PATTERN = re.compile(r"\(cid\s*:\s*\d+\s*\)")
@staticmethod
def _is_garbled_char(ch):
"""Check if a single character is garbled (unmappable from PDF font encoding).
A character is considered garbled if it falls into Unicode Private Use Areas
or certain replacement/control character ranges that typically indicate
pdfminer failed to map a CID to a valid Unicode codepoint.
"""
if not ch:
return False
cp = ord(ch)
if 0xE000 <= cp <= 0xF8FF:
return True
if 0xF0000 <= cp <= 0xFFFFF:
return True
if 0x100000 <= cp <= 0x10FFFF:
return True
if cp == 0xFFFD:
return True
if cp < 0x20 and ch not in ('\t', '\n', '\r'):
return True
if 0x80 <= cp <= 0x9F:
return True
cat = unicodedata.category(ch)
if cat in ("Cn", "Cs"):
return True
return False
@staticmethod
def _is_garbled_text(text, threshold=0.5):
    """Decide whether ``text`` is dominated by garbled characters.

    Whitespace is ignored. The text is reported as garbled when it contains
    at least one pdfminer CID placeholder such as '(cid:123)', or when the
    share of non-whitespace characters flagged by ``_is_garbled_char`` is
    greater than or equal to ``threshold``.

    Args:
        text: The extracted text to inspect.
        threshold: Minimum garbled fraction (0..1) that marks the text as
            garbled.

    Returns:
        ``True`` if the text is considered garbled; ``False`` otherwise,
        including for empty or whitespace-only input.
    """
    if not (text and text.strip()):
        return False
    # One CID placeholder is treated as conclusive, whatever the ratio is.
    if RAGFlowPdfParser._CID_PATTERN.search(text) is not None:
        return True
    visible = [ch for ch in text if not ch.isspace()]
    if not visible:
        return False
    bad = sum(1 for ch in visible if RAGFlowPdfParser._is_garbled_char(ch))
    return bad / len(visible) >= threshold
@staticmethod
def _has_subset_font_prefix(fontname):
"""Check if a font name has a subset prefix (e.g. 'DY1+ZLQDm1-1').
PDF subset fonts use a 6-letter uppercase tag followed by '+' before
the actual font name. Some tools use shorter tags (e.g. 'DY1+').
"""
if not fontname:
return False
return bool(re.match(r"^[A-Z0-9]{2,6}\+", fontname))
@staticmethod
def _is_garbled_by_font_encoding(page_chars, min_chars=20):
"""Detect garbled text caused by broken font encoding mappings.
Some PDFs (especially older Chinese standards) embed custom fonts that
map CJK glyphs to ASCII codepoints. The extracted text appears as
random ASCII punctuation/symbols instead of actual CJK characters.
Detection strategy: if a significant proportion of characters come from
subset-embedded fonts and the page produces overwhelmingly ASCII
(punctuation, digits, symbols) with virtually no CJK/Hangul/Kana
characters, the page is likely garbled due to broken font encoding.
"""
if not page_chars or len(page_chars) < min_chars:
return False
subset_font_count = 0
total_non_space = 0
ascii_punct_sym = 0
cjk_like = 0
for c in page_chars:
text = c.get("text", "")
fontname = c.get("fontname", "")
if not text or text.isspace():
continue
total_non_space += 1
if RAGFlowPdfParser._has_subset_font_prefix(fontname):
subset_font_count += 1
cp = ord(text[0])
if (0x2E80 <= cp <= 0x9FFF or 0xF900 <= cp <= 0xFAFF
or 0x20000 <= cp <= 0x2FA1F
or 0xAC00 <= cp <= 0xD7AF
or 0x3040 <= cp <= 0x30FF):
cjk_like += 1
elif (0x21 <= cp <= 0x2F or 0x3A <= cp <= 0x40
or 0x5B <= cp <= 0x60 or 0x7B <= cp <= 0x7E):
ascii_punct_sym += 1
if total_non_space < min_chars:
return False
subset_ratio = subset_font_count / total_non_space
if subset_ratio < 0.3:
return False
cjk_ratio = cjk_like / total_non_space
punct_ratio = ascii_punct_sym / total_non_space
if cjk_ratio < 0.05 and punct_ratio > 0.4:
return True
return False
def _evaluate_table_orientation(self, table_img, sample_ratio=0.3):
"""
Evaluate the best rotation orientation for a table image.
@ -618,14 +740,40 @@ class RAGFlowPdfParser:
if not b["chars"]:
del b["chars"]
continue
m_ht = np.mean([c["height"] for c in b["chars"]])
for c in Recognizer.sort_Y_firstly(b["chars"], m_ht):
box_chars = b["chars"]
m_ht = np.mean([c["height"] for c in box_chars])
garbled_count = 0
total_count = 0
for c in Recognizer.sort_Y_firstly(box_chars, m_ht):
if c["text"] == " " and b["text"]:
if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]):
b["text"] += " "
else:
b["text"] += c["text"]
for ch in c["text"]:
if not ch.isspace():
total_count += 1
if self._is_garbled_char(ch):
garbled_count += 1
del b["chars"]
# If the majority of characters from pdfplumber are garbled,
# clear the text so OCR recognition will be used as fallback.
# Strategy 1: PUA / unmapped CID characters
if total_count > 0 and garbled_count / total_count >= 0.5:
logging.info(
"Page %d: detected garbled pdfplumber text (garbled=%d/%d), falling back to OCR for box at (%.1f, %.1f)",
pagenum, garbled_count, total_count, b["x0"], b["top"],
)
b["text"] = ""
continue
# Strategy 2: font-encoding garbling — all chars are ASCII
# punctuation from subset fonts (no CJK output)
if total_count > 0 and self._is_garbled_by_font_encoding(box_chars, min_chars=5):
logging.info(
"Page %d: detected font-encoding garbled text (%d chars), falling back to OCR for box at (%.1f, %.1f)",
pagenum, total_count, b["x0"], b["top"],
)
b["text"] = ""
logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")
start = timer()
@ -1400,6 +1548,34 @@ class RAGFlowPdfParser:
logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead.
# Detect garbled pages and clear their chars so the OCR
# path will be used instead. Two detection strategies:
# 1) PUA / unmapped CID characters (threshold=0.3)
# 2) Font-encoding garbling: subset fonts mapping CJK to ASCII
for pi, page_ch in enumerate(self.page_chars):
if not page_ch:
continue
# Strategy 1: PUA / CID garbling
sample = page_ch if len(page_ch) <= 200 else page_ch[:200]
sample_text = "".join(c.get("text", "") for c in sample)
if self._is_garbled_text(sample_text, threshold=0.3):
logging.warning(
"Page %d: pdfplumber extracted mostly garbled characters (%d chars), "
"clearing to use OCR fallback.",
page_from + pi + 1, len(page_ch),
)
self.page_chars[pi] = []
continue
# Strategy 2: font-encoding garbling (CJK mapped to ASCII)
if self._is_garbled_by_font_encoding(page_ch):
logging.warning(
"Page %d: detected font-encoding garbled text "
"(subset fonts with no CJK output, %d chars), "
"clearing to use OCR fallback.",
page_from + pi + 1, len(page_ch),
)
self.page_chars[pi] = []
self.total_page = len(self.pdf.pages)
except Exception as e: