feat: PaddleOCR PDF parser supports thumnails and positions (#12565)

### What problem does this PR solve?

1. PaddleOCR PDF parser supports thumnails and positions.
2. Add FAQ documentation for PaddleOCR PDF parser.


### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Lin Manhui
2026-01-13 09:51:08 +08:00
committed by GitHub
parent 44bada64c9
commit 4fe3c24198
4 changed files with 259 additions and 60 deletions

View File

@ -24,7 +24,10 @@ from os import PathLike
from pathlib import Path
from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List
import numpy as np
import pdfplumber
import requests
from PIL import Image
try:
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
@ -60,8 +63,8 @@ def _remove_images_from_markdown(markdown: str) -> str:
class PaddleOCRVLConfig:
"""Configuration for PaddleOCR-VL algorithm."""
use_doc_orientation_classify: Optional[bool] = None
use_doc_unwarping: Optional[bool] = None
use_doc_orientation_classify: Optional[bool] = False
use_doc_unwarping: Optional[bool] = False
use_layout_detection: Optional[bool] = None
use_polygon_points: Optional[bool] = None
use_chart_recognition: Optional[bool] = None
@ -79,7 +82,7 @@ class PaddleOCRVLConfig:
min_pixels: Optional[int] = None
max_pixels: Optional[int] = None
max_new_tokens: Optional[int] = None
merge_layout_blocks: Optional[bool] = None
merge_layout_blocks: Optional[bool] = False
markdown_ignore_labels: Optional[List[str]] = None
vlm_extra_args: Optional[dict] = None
@ -116,14 +119,12 @@ class PaddleOCRConfig:
if algorithm == "PaddleOCR-VL":
# Create default PaddleOCRVLConfig object and convert to dict
algorithm_config = asdict(PaddleOCRVLConfig())
# Apply user-provided VL config
vl_config = cfg.get("vl")
if isinstance(vl_config, dict):
algorithm_config.update({k: v for k, v in vl_config.items() if v is not None})
algorithm_config_user = cfg.get("algorithm_config")
if isinstance(algorithm_config_user, dict):
algorithm_config.update({k: v for k, v in algorithm_config_user.items() if v is not None})
# Remove processed keys
cfg.pop("vl", None)
cfg.pop("algorithm_config", None)
# Prepare initialization arguments
field_names = {field.name for field in fields(cls)}
@ -146,6 +147,8 @@ class PaddleOCRConfig:
class PaddleOCRParser(RAGFlowPdfParser):
"""Parser for PDF documents using PaddleOCR API."""
_ZOOMIN = 2
_COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
"prettify_markdown": "prettifyMarkdown",
"show_formula_number": "showFormulaNumber",
@ -188,6 +191,8 @@ class PaddleOCRParser(RAGFlowPdfParser):
request_timeout: int = 600,
):
"""Initialize PaddleOCR parser."""
super().__init__()
self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
self.algorithm = algorithm
@ -197,6 +202,10 @@ class PaddleOCRParser(RAGFlowPdfParser):
# Force PDF file type
self.file_type = 0
# Initialize page images for cropping
self.page_images: list[Image.Image] = []
self.page_from = 0
# Public methods
def check_installation(self) -> tuple[bool, str]:
"""Check if the parser is properly installed and configured."""
@ -222,7 +231,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
show_formula_number: Optional[bool] = None,
visualize: Optional[bool] = None,
additional_params: Optional[dict[str, Any]] = None,
vl_config: Optional[dict[str, Any]] = None,
algorithm_config: Optional[dict[str, Any]] = None,
**kwargs: Any,
) -> ParseResult:
"""Parse PDF document using PaddleOCR API."""
@ -241,22 +250,24 @@ class PaddleOCRParser(RAGFlowPdfParser):
config_dict["visualize"] = visualize
if additional_params is not None:
config_dict["additional_params"] = additional_params
if vl_config is not None:
config_dict["vl"] = vl_config
# Add any VL config parameters from kwargs
for key, value in kwargs.items():
if key in {field.name for field in fields(PaddleOCRVLConfig)}:
config_dict[key] = value
if algorithm_config is not None:
config_dict["algorithm_config"] = algorithm_config
cfg = PaddleOCRConfig.from_dict(config_dict)
if not cfg.api_url:
raise RuntimeError("[PaddleOCR] API URL missing")
# Prepare file data
# Prepare file data and generate page images for cropping
data_bytes = self._prepare_file_data(filepath, binary)
# Generate page images for cropping functionality
input_source = filepath if binary is None else binary
try:
self.__images__(input_source, callback=callback)
except Exception as e:
self.logger.warning(f"[PaddleOCR] Failed to generate page images for cropping: {e}")
# Build and send request
result = self._send_request(data_bytes, cfg, callback)
@ -377,7 +388,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
label = block.get("block_label", "")
block_bbox = block.get("block_bbox", [0, 0, 0, 0])
tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##"
tag = f"@@{page_idx + 1}\t{block_bbox[0] // self._ZOOMIN}\t{block_bbox[2] // self._ZOOMIN}\t{block_bbox[1] // self._ZOOMIN}\t{block_bbox[3] // self._ZOOMIN}##"
if parse_method == "manual":
sections.append((block_content, label, tag))
@ -392,6 +403,149 @@ class PaddleOCRParser(RAGFlowPdfParser):
"""Convert API response to table tuples."""
return []
def __images__(self, fnm, page_from=0, page_to=100, callback=None):
"""Generate page images from PDF for cropping."""
self.page_from = page_from
self.page_to = page_to
try:
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
self.pdf = pdf
self.page_images = [p.to_image(resolution=72, antialias=True).original for i, p in enumerate(self.pdf.pages[page_from:page_to])]
except Exception as e:
self.page_images = None
self.logger.exception(e)
@staticmethod
def extract_positions(txt: str):
"""Extract position information from text tags."""
poss = []
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
return poss
def crop(self, text: str, need_position: bool = False):
"""Crop images from PDF based on position tags in text."""
imgs = []
poss = self.extract_positions(text)
if not poss:
if need_position:
return None, None
return
if not getattr(self, "page_images", None):
self.logger.warning("[PaddleOCR] crop called without page images; skipping image generation.")
if need_position:
return None, None
return
page_count = len(self.page_images)
filtered_poss = []
for pns, left, right, top, bottom in poss:
if not pns:
self.logger.warning("[PaddleOCR] Empty page index list in crop; skipping this position.")
continue
valid_pns = [p for p in pns if 0 <= p < page_count]
if not valid_pns:
self.logger.warning(f"[PaddleOCR] All page indices {pns} out of range for {page_count} pages; skipping.")
continue
filtered_poss.append((valid_pns, left, right, top, bottom))
poss = filtered_poss
if not poss:
self.logger.warning("[PaddleOCR] No valid positions after filtering; skip cropping.")
if need_position:
return None, None
return
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
GAP = 6
pos = poss[0]
first_page_idx = pos[0][0]
poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
pos = poss[-1]
last_page_idx = pos[0][-1]
if not (0 <= last_page_idx < page_count):
self.logger.warning(f"[PaddleOCR] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
if need_position:
return None, None
return
last_page_height = self.page_images[last_page_idx].size[1]
poss.append(
(
[last_page_idx],
pos[1],
pos[2],
min(last_page_height, pos[4] + GAP),
min(last_page_height, pos[4] + 120),
)
)
positions = []
for ii, (pns, left, right, top, bottom) in enumerate(poss):
right = left + max_width
if bottom <= top:
bottom = top + 2
for pn in pns[1:]:
if 0 <= pn - 1 < page_count:
bottom += self.page_images[pn - 1].size[1]
else:
self.logger.warning(f"[PaddleOCR] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
if not (0 <= pns[0] < page_count):
self.logger.warning(f"[PaddleOCR] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
continue
img0 = self.page_images[pns[0]]
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
crop0 = img0.crop((x0, y0, x1, y1))
imgs.append(crop0)
if 0 < ii < len(poss) - 1:
positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
bottom -= img0.size[1]
for pn in pns[1:]:
if not (0 <= pn < page_count):
self.logger.warning(f"[PaddleOCR] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
continue
page = self.page_images[pn]
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
cimgp = page.crop((x0, y0, x1, y1))
imgs.append(cimgp)
if 0 < ii < len(poss) - 1:
positions.append((pn + self.page_from, x0, x1, y0, y1))
bottom -= page.size[1]
if not imgs:
if need_position:
return None, None
return
height = 0
for img in imgs:
height += img.size[1] + GAP
height = int(height)
width = int(np.max([i.size[0] for i in imgs]))
pic = Image.new("RGB", (width, height), (245, 245, 245))
height = 0
for ii, img in enumerate(imgs):
if ii == 0 or ii + 1 == len(imgs):
img = img.convert("RGBA")
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
overlay.putalpha(128)
img = Image.alpha_composite(img, overlay).convert("RGB")
pic.paste(img, (0, int(height)))
height += img.size[1] + GAP
if need_position:
return pic, positions
return pic
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)

View File

@ -566,3 +566,82 @@ RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate do
:::tip NOTE
When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server.
:::
### How to use PaddleOCR for document parsing?
From v0.24.0 onwards, RAGFlow includes PaddleOCR as an optional PDF parser. Please note that RAGFlow acts only as a *remote client* for PaddleOCR, calling the PaddleOCR API to parse PDFs and reading the returned files.
There are two main ways to configure and use PaddleOCR in RAGFlow:
#### 1. Using PaddleOCR Official API
This method uses PaddleOCR's official API service with an access token.
**Step 1: Configure RAGFlow**
- **Via Environment Variables:**
```bash
# In your docker/.env file:
PADDLEOCR_API_URL=https://your-paddleocr-api-endpoint
PADDLEOCR_ALGORITHM=PaddleOCR-VL
PADDLEOCR_ACCESS_TOKEN=your-access-token-here
```
- **Via UI:**
- Navigate to **Model providers** page
- Add a new OCR model with factory type "PaddleOCR"
- Configure the following fields:
- **PaddleOCR API URL**: Your PaddleOCR API endpoint
- **PaddleOCR Algorithm**: Select the algorithm corresponding to the API endpoint
- **AI Studio Access Token**: Your access token for the PaddleOCR API
**Step 2: Usage in Dataset Configuration**
- In your dataset's **Configuration** page, find the **Ingestion pipeline** section
- If using built-in chunking methods that support PDF parsing, select **PaddleOCR** from the **PDF parser** dropdown
- If using custom ingestion pipeline, select **PaddleOCR** in the **Parser** component
**Notes:**
- To obtain the API URL, visit the [PaddleOCR official website](https://aistudio.baidu.com/paddleocr/task), click the **API** button in the upper-left corner, choose the example code for the specific algorithm you want to use (e.g., PaddleOCR-VL), and copy the `API_URL`.
- Access tokens can be obtained from the [AI Studio platform](https://aistudio.baidu.com/account/accessToken).
- This method requires internet connectivity to reach the official PaddleOCR API.
#### 2. Using Self-Hosted PaddleOCR Service
This method allows you to deploy your own PaddleOCR service and use it without an access token.
**Step 1: Deploy PaddleOCR Service**
Follow the [PaddleOCR serving documentation](https://www.paddleocr.ai/latest/en/version3.x/deployment/serving.html) to deploy your own service. For layout parsing, you can use an endpoint like:
```bash
http://localhost:8080/layout-parsing
```
**Step 2: Configure RAGFlow**
- **Via Environment Variables:**
```bash
PADDLEOCR_API_URL=http://localhost:8080/layout-parsing
PADDLEOCR_ALGORITHM=PaddleOCR-VL
# No access token required for self-hosted service
```
- **Via UI:**
- Navigate to **Model providers** page
- Add a new OCR model with factory type "PaddleOCR"
- Configure the following fields:
- **PaddleOCR API URL**: The endpoint of your deployed service
- **PaddleOCR Algorithm**: Select the algorithm corresponding to the deployed service
- **AI Studio Access Token**: Leave empty
**Step 3: Usage in Dataset Configuration**
- In your dataset's **Configuration** page, find the **Ingestion pipeline** section
- If using built-in chunking methods that support PDF parsing, select **PaddleOCR** from the **PDF parser** dropdown
- If using custom ingestion pipeline, select **PaddleOCR** in the **Parser** component
#### Environment Variables Summary
| Environment Variable | Description | Default | Required |
|---------------------|-------------|---------|----------|
| `PADDLEOCR_API_URL` | PaddleOCR API endpoint URL | `""` | Yes, when using environment variables |
| `PADDLEOCR_ALGORITHM` | Algorithm to use for parsing | `"PaddleOCR-VL"` | No |
| `PADDLEOCR_ACCESS_TOKEN` | Access token for official API | `None` | Only when using official API |
Environment variables can be used for auto-provisioning, but are not required if configuring via UI. When environment variables are set, these values are used to auto-provision a PaddleOCR model for the tenant on first use.

View File

@ -358,48 +358,14 @@ class Parser(ProcessBase):
parse_method=conf.get("paddleocr_parse_method", "raw"),
)
bboxes = []
for section in lines:
# PaddleOCRParser returns sections as tuple, different formats based on parse_method:
# - "raw": (text, position_tag)
# - "manual": (text, label, position_tag)
# - "paper": (text_with_tag, label)
text = section[0]
# Parse position tag if exists
position_tag = ""
if len(section) > 1:
if len(section) == 2: # raw format: (text, tag)
position_tag = section[1]
elif len(section) == 3: # manual format: (text, label, tag)
position_tag = section[2]
elif "paper" in conf.get("paddleocr_parse_method", "") and len(section) == 2:
# paper format: text may contain tag
text_with_tag = text
import re
tag_match = re.search(r"(@@[0-9-]+\t[0-9.\t]+##)", text_with_tag)
if tag_match:
position_tag = tag_match.group(1)
text = text_with_tag.replace(position_tag, "").strip()
# Extract coordinate information from position tag
page_number, x0, x1, top, bottom = 1, 0, 0, 0, 0
if position_tag:
import re
tag_match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
if tag_match:
pn, x0_str, x1_str, top_str, bottom_str = tag_match.groups()
page_number = int(pn.split("-")[0]) # Take first page number
x0, x1, top, bottom = float(x0_str), float(x1_str), float(top_str), float(bottom_str)
for t, poss in lines:
# Get cropped image and positions
cropped_image, positions = pdf_parser.crop(poss, need_position=True)
box = {
"text": text,
"page_number": page_number,
"x0": x0,
"x1": x1,
"top": top,
"bottom": bottom,
"text": t,
"image": cropped_image,
"positions": positions,
}
bboxes.append(box)
else:

View File

@ -122,7 +122,7 @@ const PaddleOCRModal = ({
disabled={loading}
className="btn btn-primary"
>
{loading ? t('common.adding') : t('common.add')}
{t('common.add')}
</button>
</div>
</form>