mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-03 16:57:48 +08:00
## Problem When PDF fonts lack ToUnicode/CMap mappings, pdfplumber (pdfminer) cannot map CIDs to correct Unicode characters, outputting PUA characters (U+E000~U+F8FF) or `(cid:xxx)` placeholders. The original code fully trusted pdfplumber text without any garbled detection, causing garbled output in the final parsed result. Relates to #13366 ## Solution ### 1. Garbled text detection functions - `_is_garbled_char(ch)`: Detects PUA characters (BMP/Plane 15/16), replacement character U+FFFD, control characters, and unassigned/surrogate codepoints - `_is_garbled_text(text, threshold)`: Calculates garbled ratio and detects `(cid:xxx)` patterns ### 2. Box-level fallback (in `__ocr()`) When a text box has ≥50% garbled characters, discard pdfplumber text and fallback to OCR recognition. ### 3. Page-level detection (in `__images__()`) Sample characters from each page; if garbled rate ≥30%, clear all pdfplumber characters for that page, forcing full OCR. ### 4. Layout recognizer CID filtering Filter out `(cid:xxx)` patterns in `layout_recognizer.py` text processing to prevent them from polluting layout analysis. ## Testing - 29 unit tests covering: normal CJK/English text, PUA characters, CID patterns, mixed text, boundary thresholds, edge cases - All 85 existing project unit tests pass without regression
This commit is contained in:
@ -22,6 +22,7 @@ import random
|
||||
import re
|
||||
import sys
|
||||
import threading
|
||||
import unicodedata
|
||||
from collections import Counter, defaultdict
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
@ -197,6 +198,127 @@ class RAGFlowPdfParser:
|
||||
return False
|
||||
return True
|
||||
|
||||
# Matches pdfminer's literal placeholder for a CID it could not map to
# Unicode, e.g. "(cid:123)"; whitespace around the colon/number is tolerated.
_CID_PATTERN = re.compile(r"\(cid\s*:\s*\d+\s*\)")
|
||||
|
||||
@staticmethod
|
||||
def _is_garbled_char(ch):
|
||||
"""Check if a single character is garbled (unmappable from PDF font encoding).
|
||||
|
||||
A character is considered garbled if it falls into Unicode Private Use Areas
|
||||
or certain replacement/control character ranges that typically indicate
|
||||
pdfminer failed to map a CID to a valid Unicode codepoint.
|
||||
"""
|
||||
if not ch:
|
||||
return False
|
||||
cp = ord(ch)
|
||||
if 0xE000 <= cp <= 0xF8FF:
|
||||
return True
|
||||
if 0xF0000 <= cp <= 0xFFFFF:
|
||||
return True
|
||||
if 0x100000 <= cp <= 0x10FFFF:
|
||||
return True
|
||||
if cp == 0xFFFD:
|
||||
return True
|
||||
if cp < 0x20 and ch not in ('\t', '\n', '\r'):
|
||||
return True
|
||||
if 0x80 <= cp <= 0x9F:
|
||||
return True
|
||||
cat = unicodedata.category(ch)
|
||||
if cat in ("Cn", "Cs"):
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
def _is_garbled_text(text, threshold=0.5):
    """Return True when *text* contains too many garbled characters.

    Immediately flags strings containing pdfminer's '(cid:123)'-style
    placeholders; otherwise computes the fraction of non-whitespace
    characters that `_is_garbled_char` flags and compares it against
    *threshold*. Empty or whitespace-only input is never garbled.
    """
    if not text or not text.strip():
        return False
    # Any literal CID placeholder means pdfminer failed outright.
    if RAGFlowPdfParser._CID_PATTERN.search(text):
        return True
    visible = [ch for ch in text if not ch.isspace()]
    if not visible:
        return False
    garbled = sum(1 for ch in visible if RAGFlowPdfParser._is_garbled_char(ch))
    return garbled / len(visible) >= threshold
|
||||
@staticmethod
|
||||
def _has_subset_font_prefix(fontname):
|
||||
"""Check if a font name has a subset prefix (e.g. 'DY1+ZLQDm1-1').
|
||||
|
||||
PDF subset fonts use a 6-letter uppercase tag followed by '+' before
|
||||
the actual font name. Some tools use shorter tags (e.g. 'DY1+').
|
||||
"""
|
||||
if not fontname:
|
||||
return False
|
||||
return bool(re.match(r"^[A-Z0-9]{2,6}\+", fontname))
|
||||
|
||||
@staticmethod
def _is_garbled_by_font_encoding(page_chars, min_chars=20):
    """Detect garbled text caused by broken font encoding mappings.

    Some PDFs (notably older Chinese standards) embed custom fonts that
    map CJK glyphs onto ASCII codepoints, so the extracted text comes
    out as random ASCII punctuation/symbols instead of CJK characters.

    Heuristic: if at least 30% of non-space characters come from
    subset-embedded fonts, and the page yields virtually no
    CJK/Hangul/Kana (<5%) while ASCII punctuation/symbols dominate
    (>40%), the page is likely garbled. Pages with fewer than
    *min_chars* usable characters are never flagged.
    """
    if not page_chars or len(page_chars) < min_chars:
        return False

    non_space = 0
    subset_hits = 0
    cjk_hits = 0
    punct_hits = 0

    for char_info in page_chars:
        txt = char_info.get("text", "")
        if not txt or txt.isspace():
            continue
        non_space += 1

        if RAGFlowPdfParser._has_subset_font_prefix(char_info.get("fontname", "")):
            subset_hits += 1

        cp = ord(txt[0])
        # CJK ideographs (incl. extensions/compat), Hangul, Kana.
        is_cjk = (
            0x2E80 <= cp <= 0x9FFF
            or 0xF900 <= cp <= 0xFAFF
            or 0x20000 <= cp <= 0x2FA1F
            or 0xAC00 <= cp <= 0xD7AF
            or 0x3040 <= cp <= 0x30FF
        )
        if is_cjk:
            cjk_hits += 1
        elif (0x21 <= cp <= 0x2F or 0x3A <= cp <= 0x40
              or 0x5B <= cp <= 0x60 or 0x7B <= cp <= 0x7E):
            # ASCII punctuation/symbol blocks (excludes letters/digits).
            punct_hits += 1

    if non_space < min_chars:
        return False

    if subset_hits / non_space < 0.3:
        return False

    return cjk_hits / non_space < 0.05 and punct_hits / non_space > 0.4
|
||||
def _evaluate_table_orientation(self, table_img, sample_ratio=0.3):
|
||||
"""
|
||||
Evaluate the best rotation orientation for a table image.
|
||||
@ -618,14 +740,40 @@ class RAGFlowPdfParser:
|
||||
if not b["chars"]:
|
||||
del b["chars"]
|
||||
continue
|
||||
m_ht = np.mean([c["height"] for c in b["chars"]])
|
||||
for c in Recognizer.sort_Y_firstly(b["chars"], m_ht):
|
||||
box_chars = b["chars"]
|
||||
m_ht = np.mean([c["height"] for c in box_chars])
|
||||
garbled_count = 0
|
||||
total_count = 0
|
||||
for c in Recognizer.sort_Y_firstly(box_chars, m_ht):
|
||||
if c["text"] == " " and b["text"]:
|
||||
if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]):
|
||||
b["text"] += " "
|
||||
else:
|
||||
b["text"] += c["text"]
|
||||
for ch in c["text"]:
|
||||
if not ch.isspace():
|
||||
total_count += 1
|
||||
if self._is_garbled_char(ch):
|
||||
garbled_count += 1
|
||||
del b["chars"]
|
||||
# If the majority of characters from pdfplumber are garbled,
|
||||
# clear the text so OCR recognition will be used as fallback.
|
||||
# Strategy 1: PUA / unmapped CID characters
|
||||
if total_count > 0 and garbled_count / total_count >= 0.5:
|
||||
logging.info(
|
||||
"Page %d: detected garbled pdfplumber text (garbled=%d/%d), falling back to OCR for box at (%.1f, %.1f)",
|
||||
pagenum, garbled_count, total_count, b["x0"], b["top"],
|
||||
)
|
||||
b["text"] = ""
|
||||
continue
|
||||
# Strategy 2: font-encoding garbling — all chars are ASCII
|
||||
# punctuation from subset fonts (no CJK output)
|
||||
if total_count > 0 and self._is_garbled_by_font_encoding(box_chars, min_chars=5):
|
||||
logging.info(
|
||||
"Page %d: detected font-encoding garbled text (%d chars), falling back to OCR for box at (%.1f, %.1f)",
|
||||
pagenum, total_count, b["x0"], b["top"],
|
||||
)
|
||||
b["text"] = ""
|
||||
|
||||
logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")
|
||||
start = timer()
|
||||
@ -1400,6 +1548,34 @@ class RAGFlowPdfParser:
|
||||
logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
|
||||
self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead.
|
||||
|
||||
# Detect garbled pages and clear their chars so the OCR
|
||||
# path will be used instead. Two detection strategies:
|
||||
# 1) PUA / unmapped CID characters (threshold=0.3)
|
||||
# 2) Font-encoding garbling: subset fonts mapping CJK to ASCII
|
||||
for pi, page_ch in enumerate(self.page_chars):
|
||||
if not page_ch:
|
||||
continue
|
||||
# Strategy 1: PUA / CID garbling
|
||||
sample = page_ch if len(page_ch) <= 200 else page_ch[:200]
|
||||
sample_text = "".join(c.get("text", "") for c in sample)
|
||||
if self._is_garbled_text(sample_text, threshold=0.3):
|
||||
logging.warning(
|
||||
"Page %d: pdfplumber extracted mostly garbled characters (%d chars), "
|
||||
"clearing to use OCR fallback.",
|
||||
page_from + pi + 1, len(page_ch),
|
||||
)
|
||||
self.page_chars[pi] = []
|
||||
continue
|
||||
# Strategy 2: font-encoding garbling (CJK mapped to ASCII)
|
||||
if self._is_garbled_by_font_encoding(page_ch):
|
||||
logging.warning(
|
||||
"Page %d: detected font-encoding garbled text "
|
||||
"(subset fonts with no CJK output, %d chars), "
|
||||
"clearing to use OCR fallback.",
|
||||
page_from + pi + 1, len(page_ch),
|
||||
)
|
||||
self.page_chars[pi] = []
|
||||
|
||||
self.total_page = len(self.pdf.pages)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user