mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-19 11:45:10 +08:00
feat: PaddleOCR PDF parser supports thumbnails and positions (#12565)
### What problem does this PR solve? 1. PaddleOCR PDF parser supports thumbnails and positions. 2. Add FAQ documentation for PaddleOCR PDF parser. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -24,7 +24,10 @@ from os import PathLike
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List
|
||||
|
||||
import numpy as np
|
||||
import pdfplumber
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
try:
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
@ -60,8 +63,8 @@ def _remove_images_from_markdown(markdown: str) -> str:
|
||||
class PaddleOCRVLConfig:
|
||||
"""Configuration for PaddleOCR-VL algorithm."""
|
||||
|
||||
use_doc_orientation_classify: Optional[bool] = None
|
||||
use_doc_unwarping: Optional[bool] = None
|
||||
use_doc_orientation_classify: Optional[bool] = False
|
||||
use_doc_unwarping: Optional[bool] = False
|
||||
use_layout_detection: Optional[bool] = None
|
||||
use_polygon_points: Optional[bool] = None
|
||||
use_chart_recognition: Optional[bool] = None
|
||||
@ -79,7 +82,7 @@ class PaddleOCRVLConfig:
|
||||
min_pixels: Optional[int] = None
|
||||
max_pixels: Optional[int] = None
|
||||
max_new_tokens: Optional[int] = None
|
||||
merge_layout_blocks: Optional[bool] = None
|
||||
merge_layout_blocks: Optional[bool] = False
|
||||
markdown_ignore_labels: Optional[List[str]] = None
|
||||
vlm_extra_args: Optional[dict] = None
|
||||
|
||||
@ -116,14 +119,12 @@ class PaddleOCRConfig:
|
||||
if algorithm == "PaddleOCR-VL":
|
||||
# Create default PaddleOCRVLConfig object and convert to dict
|
||||
algorithm_config = asdict(PaddleOCRVLConfig())
|
||||
|
||||
# Apply user-provided VL config
|
||||
vl_config = cfg.get("vl")
|
||||
if isinstance(vl_config, dict):
|
||||
algorithm_config.update({k: v for k, v in vl_config.items() if v is not None})
|
||||
algorithm_config_user = cfg.get("algorithm_config")
|
||||
if isinstance(algorithm_config_user, dict):
|
||||
algorithm_config.update({k: v for k, v in algorithm_config_user.items() if v is not None})
|
||||
|
||||
# Remove processed keys
|
||||
cfg.pop("vl", None)
|
||||
cfg.pop("algorithm_config", None)
|
||||
|
||||
# Prepare initialization arguments
|
||||
field_names = {field.name for field in fields(cls)}
|
||||
@ -146,6 +147,8 @@ class PaddleOCRConfig:
|
||||
class PaddleOCRParser(RAGFlowPdfParser):
|
||||
"""Parser for PDF documents using PaddleOCR API."""
|
||||
|
||||
_ZOOMIN = 2
|
||||
|
||||
_COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
|
||||
"prettify_markdown": "prettifyMarkdown",
|
||||
"show_formula_number": "showFormulaNumber",
|
||||
@ -188,6 +191,8 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
request_timeout: int = 600,
|
||||
):
|
||||
"""Initialize PaddleOCR parser."""
|
||||
super().__init__()
|
||||
|
||||
self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
|
||||
self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
|
||||
self.algorithm = algorithm
|
||||
@ -197,6 +202,10 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
# Force PDF file type
|
||||
self.file_type = 0
|
||||
|
||||
# Initialize page images for cropping
|
||||
self.page_images: list[Image.Image] = []
|
||||
self.page_from = 0
|
||||
|
||||
# Public methods
|
||||
def check_installation(self) -> tuple[bool, str]:
|
||||
"""Check if the parser is properly installed and configured."""
|
||||
@ -222,7 +231,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
show_formula_number: Optional[bool] = None,
|
||||
visualize: Optional[bool] = None,
|
||||
additional_params: Optional[dict[str, Any]] = None,
|
||||
vl_config: Optional[dict[str, Any]] = None,
|
||||
algorithm_config: Optional[dict[str, Any]] = None,
|
||||
**kwargs: Any,
|
||||
) -> ParseResult:
|
||||
"""Parse PDF document using PaddleOCR API."""
|
||||
@ -241,22 +250,24 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
config_dict["visualize"] = visualize
|
||||
if additional_params is not None:
|
||||
config_dict["additional_params"] = additional_params
|
||||
if vl_config is not None:
|
||||
config_dict["vl"] = vl_config
|
||||
|
||||
# Add any VL config parameters from kwargs
|
||||
for key, value in kwargs.items():
|
||||
if key in {field.name for field in fields(PaddleOCRVLConfig)}:
|
||||
config_dict[key] = value
|
||||
if algorithm_config is not None:
|
||||
config_dict["algorithm_config"] = algorithm_config
|
||||
|
||||
cfg = PaddleOCRConfig.from_dict(config_dict)
|
||||
|
||||
if not cfg.api_url:
|
||||
raise RuntimeError("[PaddleOCR] API URL missing")
|
||||
|
||||
# Prepare file data
|
||||
# Prepare file data and generate page images for cropping
|
||||
data_bytes = self._prepare_file_data(filepath, binary)
|
||||
|
||||
# Generate page images for cropping functionality
|
||||
input_source = filepath if binary is None else binary
|
||||
try:
|
||||
self.__images__(input_source, callback=callback)
|
||||
except Exception as e:
|
||||
self.logger.warning(f"[PaddleOCR] Failed to generate page images for cropping: {e}")
|
||||
|
||||
# Build and send request
|
||||
result = self._send_request(data_bytes, cfg, callback)
|
||||
|
||||
@ -377,7 +388,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
label = block.get("block_label", "")
|
||||
block_bbox = block.get("block_bbox", [0, 0, 0, 0])
|
||||
|
||||
tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##"
|
||||
tag = f"@@{page_idx + 1}\t{block_bbox[0] // self._ZOOMIN}\t{block_bbox[2] // self._ZOOMIN}\t{block_bbox[1] // self._ZOOMIN}\t{block_bbox[3] // self._ZOOMIN}##"
|
||||
|
||||
if parse_method == "manual":
|
||||
sections.append((block_content, label, tag))
|
||||
@ -392,6 +403,149 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
"""Convert API response to table tuples."""
|
||||
return []
|
||||
|
||||
def __images__(self, fnm, page_from=0, page_to=100, callback=None):
    """Render the requested PDF pages to images and cache them for cropping.

    Args:
        fnm: Either a filesystem path (``str`` / ``PathLike``) or the raw
            PDF content, which is wrapped in ``BytesIO`` before opening.
        page_from: Index of the first page to render (slice start).
        page_to: Index one past the last page to render (slice end).
        callback: Accepted for signature compatibility; not used here.

    Side effects:
        Stores ``page_from`` / ``page_to`` on the instance and fills
        ``self.page_images`` with one image per rendered page. On any
        failure ``self.page_images`` is set to ``None`` and the exception
        is logged instead of being raised.
    """
    self.page_from = page_from
    self.page_to = page_to
    try:
        # Paths are opened directly; anything else is treated as PDF bytes.
        if isinstance(fnm, (str, PathLike)):
            opened = pdfplumber.open(fnm)
        else:
            opened = pdfplumber.open(BytesIO(fnm))
        with opened as pdf:
            # NOTE: self.pdf keeps referring to this object after the
            # ``with`` block has exited.
            self.pdf = pdf
            selected_pages = self.pdf.pages[page_from:page_to]
            self.page_images = [
                page.to_image(resolution=72, antialias=True).original
                for page in selected_pages
            ]
    except Exception as e:
        # Best effort: cropping is optional, so record the failure and go on.
        self.page_images = None
        self.logger.exception(e)
|
||||
|
||||
@staticmethod
|
||||
def extract_positions(txt: str):
|
||||
"""Extract position information from text tags."""
|
||||
poss = []
|
||||
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
|
||||
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
|
||||
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
|
||||
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
||||
return poss
|
||||
|
||||
def crop(self, text: str, need_position: bool = False):
    """Stitch a preview image for the regions named by position tags in ``text``.

    Every tagged region is cut out of the cached ``self.page_images``. An
    extra strip from just above the first region and one from just below the
    last region are added as context and dimmed. All pieces are stacked
    vertically on a light-grey canvas with ``GAP`` pixels between them.

    Args:
        text: Text carrying position tags understood by
            ``extract_positions``.
        need_position: When true, also return the rectangles that were cut.

    Returns:
        The stitched image, or ``(image, positions)`` when ``need_position``
        is true. Each position is ``(page, x0, x1, y0, y1)`` with ``page``
        offset by ``self.page_from``; the two context strips are not
        reported. When nothing can be cropped the result is ``None`` (or
        ``(None, None)`` when ``need_position`` is true).
    """
    # What "nothing to crop" looks like depends on what the caller asked for.
    nothing = (None, None) if need_position else None

    spans = self.extract_positions(text)
    if not spans:
        return nothing

    if not getattr(self, "page_images", None):
        self.logger.warning("[PaddleOCR] crop called without page images; skipping image generation.")
        return nothing

    page_count = len(self.page_images)

    # Keep only spans that still reference at least one rendered page.
    usable = []
    for pages, left, right, top, bottom in spans:
        if not pages:
            self.logger.warning("[PaddleOCR] Empty page index list in crop; skipping this position.")
            continue
        in_range = [p for p in pages if 0 <= p < page_count]
        if not in_range:
            self.logger.warning(f"[PaddleOCR] All page indices {pages} out of range for {page_count} pages; skipping.")
            continue
        usable.append((in_range, left, right, top, bottom))

    spans = usable
    if not spans:
        self.logger.warning("[PaddleOCR] No valid positions after filtering; skip cropping.")
        return nothing

    # Every piece is cut to the width of the widest region (at least 6 px).
    max_width = max(np.max([right - left for (_, left, right, _, _) in spans]), 6)
    GAP = 6

    # Context strip above the first region.
    head = spans[0]
    first_page_idx = head[0][0]
    spans.insert(0, ([first_page_idx], head[1], head[2], max(0, head[3] - 120), max(head[3] - GAP, 0)))

    # Context strip below the last region, clamped to the page height.
    tail = spans[-1]
    last_page_idx = tail[0][-1]
    if not (0 <= last_page_idx < page_count):
        self.logger.warning(f"[PaddleOCR] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
        return nothing
    last_page_height = self.page_images[last_page_idx].size[1]
    spans.append(
        (
            [last_page_idx],
            tail[1],
            tail[2],
            min(last_page_height, tail[4] + GAP),
            min(last_page_height, tail[4] + 120),
        )
    )

    pieces = []
    positions = []
    final_span = len(spans) - 1
    for idx, (pages, left, _unused_right, top, bottom) in enumerate(spans):
        right = left + max_width

        if bottom <= top:
            bottom = top + 2

        # Extend bottom by the heights of the pages preceding each extra page
        # (presumably because a multi-page bottom is given relative to the
        # last page of the span — confirm against the tag producer).
        for pn in pages[1:]:
            previous = pn - 1
            if 0 <= previous < page_count:
                bottom += self.page_images[previous].size[1]
            else:
                self.logger.warning(f"[PaddleOCR] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")

        base_idx = pages[0]
        if not (0 <= base_idx < page_count):
            self.logger.warning(f"[PaddleOCR] Base page index {base_idx} out of range for {page_count} pages during crop; skipping this segment.")
            continue

        # Only real regions (not the two context strips) are reported back.
        report = 0 < idx < final_span

        base_img = self.page_images[base_idx]
        x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, base_img.size[1]))
        pieces.append(base_img.crop((x0, y0, x1, y1)))
        if report:
            positions.append((base_idx + self.page_from, x0, x1, y0, y1))

        # Continue on following pages from their top edge, consuming the
        # remaining height page by page.
        bottom -= base_img.size[1]
        for pn in pages[1:]:
            if not (0 <= pn < page_count):
                self.logger.warning(f"[PaddleOCR] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
                continue
            page_img = self.page_images[pn]
            x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page_img.size[1]))
            pieces.append(page_img.crop((x0, y0, x1, y1)))
            if report:
                positions.append((pn + self.page_from, x0, x1, y0, y1))
            bottom -= page_img.size[1]

    if not pieces:
        return nothing

    canvas_height = int(sum(piece.size[1] + GAP for piece in pieces))
    canvas_width = int(np.max([piece.size[0] for piece in pieces]))
    pic = Image.new("RGB", (canvas_width, canvas_height), (245, 245, 245))

    offset = 0
    final_piece = len(pieces) - 1
    for idx, piece in enumerate(pieces):
        if idx == 0 or idx == final_piece:
            # Dim the first and last pieces with a half-transparent overlay.
            piece = piece.convert("RGBA")
            veil = Image.new("RGBA", piece.size, (0, 0, 0, 0))
            veil.putalpha(128)
            piece = Image.alpha_composite(piece, veil).convert("RGB")
        pic.paste(piece, (0, int(offset)))
        offset += piece.size[1] + GAP

    if need_position:
        return pic, positions
    return pic
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
79
docs/faq.mdx
79
docs/faq.mdx
@ -566,3 +566,82 @@ RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate do
|
||||
:::tip NOTE
|
||||
When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server.
|
||||
:::
|
||||
|
||||
### How to use PaddleOCR for document parsing?
|
||||
|
||||
From v0.24.0 onwards, RAGFlow includes PaddleOCR as an optional PDF parser. Please note that RAGFlow acts only as a *remote client* for PaddleOCR, calling the PaddleOCR API to parse PDFs and reading the returned files.
|
||||
|
||||
There are two main ways to configure and use PaddleOCR in RAGFlow:
|
||||
|
||||
#### 1. Using PaddleOCR Official API
|
||||
|
||||
This method uses PaddleOCR's official API service with an access token.
|
||||
|
||||
**Step 1: Configure RAGFlow**
|
||||
- **Via Environment Variables:**
|
||||
```bash
|
||||
# In your docker/.env file:
|
||||
PADDLEOCR_API_URL=https://your-paddleocr-api-endpoint
|
||||
PADDLEOCR_ALGORITHM=PaddleOCR-VL
|
||||
PADDLEOCR_ACCESS_TOKEN=your-access-token-here
|
||||
```
|
||||
|
||||
- **Via UI:**
|
||||
- Navigate to **Model providers** page
|
||||
- Add a new OCR model with factory type "PaddleOCR"
|
||||
- Configure the following fields:
|
||||
- **PaddleOCR API URL**: Your PaddleOCR API endpoint
|
||||
- **PaddleOCR Algorithm**: Select the algorithm corresponding to the API endpoint
|
||||
- **AI Studio Access Token**: Your access token for the PaddleOCR API
|
||||
|
||||
**Step 2: Usage in Dataset Configuration**
|
||||
- In your dataset's **Configuration** page, find the **Ingestion pipeline** section
|
||||
- If using built-in chunking methods that support PDF parsing, select **PaddleOCR** from the **PDF parser** dropdown
|
||||
- If using a custom ingestion pipeline, select **PaddleOCR** in the **Parser** component
|
||||
|
||||
**Notes:**
|
||||
- To obtain the API URL, visit the [PaddleOCR official website](https://aistudio.baidu.com/paddleocr/task), click the **API** button in the upper-left corner, choose the example code for the specific algorithm you want to use (e.g., PaddleOCR-VL), and copy the `API_URL`.
|
||||
- Access tokens can be obtained from the [AI Studio platform](https://aistudio.baidu.com/account/accessToken).
|
||||
- This method requires internet connectivity to reach the official PaddleOCR API.
|
||||
|
||||
#### 2. Using Self-Hosted PaddleOCR Service
|
||||
|
||||
This method allows you to deploy your own PaddleOCR service and use it without an access token.
|
||||
|
||||
**Step 1: Deploy PaddleOCR Service**
|
||||
Follow the [PaddleOCR serving documentation](https://www.paddleocr.ai/latest/en/version3.x/deployment/serving.html) to deploy your own service. For layout parsing, you can use an endpoint like:
|
||||
|
||||
```bash
|
||||
http://localhost:8080/layout-parsing
|
||||
```
|
||||
|
||||
**Step 2: Configure RAGFlow**
|
||||
- **Via Environment Variables:**
|
||||
```bash
|
||||
PADDLEOCR_API_URL=http://localhost:8080/layout-parsing
|
||||
PADDLEOCR_ALGORITHM=PaddleOCR-VL
|
||||
# No access token required for self-hosted service
|
||||
```
|
||||
|
||||
- **Via UI:**
|
||||
- Navigate to **Model providers** page
|
||||
- Add a new OCR model with factory type "PaddleOCR"
|
||||
- Configure the following fields:
|
||||
- **PaddleOCR API URL**: The endpoint of your deployed service
|
||||
- **PaddleOCR Algorithm**: Select the algorithm corresponding to the deployed service
|
||||
- **AI Studio Access Token**: Leave empty
|
||||
|
||||
**Step 3: Usage in Dataset Configuration**
|
||||
- In your dataset's **Configuration** page, find the **Ingestion pipeline** section
|
||||
- If using built-in chunking methods that support PDF parsing, select **PaddleOCR** from the **PDF parser** dropdown
|
||||
- If using a custom ingestion pipeline, select **PaddleOCR** in the **Parser** component
|
||||
|
||||
#### Environment Variables Summary
|
||||
|
||||
| Environment Variable | Description | Default | Required |
|
||||
|---------------------|-------------|---------|----------|
|
||||
| `PADDLEOCR_API_URL` | PaddleOCR API endpoint URL | `""` | Yes, when using environment variables |
|
||||
| `PADDLEOCR_ALGORITHM` | Algorithm to use for parsing | `"PaddleOCR-VL"` | No |
|
||||
| `PADDLEOCR_ACCESS_TOKEN` | Access token for official API | `None` | Only when using official API |
|
||||
|
||||
Environment variables are optional if you configure PaddleOCR via the UI. When they are set, their values are used to auto-provision a PaddleOCR model for the tenant on first use.
|
||||
|
||||
@ -358,48 +358,14 @@ class Parser(ProcessBase):
|
||||
parse_method=conf.get("paddleocr_parse_method", "raw"),
|
||||
)
|
||||
bboxes = []
|
||||
for section in lines:
|
||||
# PaddleOCRParser returns sections as tuple, different formats based on parse_method:
|
||||
# - "raw": (text, position_tag)
|
||||
# - "manual": (text, label, position_tag)
|
||||
# - "paper": (text_with_tag, label)
|
||||
text = section[0]
|
||||
|
||||
# Parse position tag if exists
|
||||
position_tag = ""
|
||||
if len(section) > 1:
|
||||
if len(section) == 2: # raw format: (text, tag)
|
||||
position_tag = section[1]
|
||||
elif len(section) == 3: # manual format: (text, label, tag)
|
||||
position_tag = section[2]
|
||||
elif "paper" in conf.get("paddleocr_parse_method", "") and len(section) == 2:
|
||||
# paper format: text may contain tag
|
||||
text_with_tag = text
|
||||
import re
|
||||
|
||||
tag_match = re.search(r"(@@[0-9-]+\t[0-9.\t]+##)", text_with_tag)
|
||||
if tag_match:
|
||||
position_tag = tag_match.group(1)
|
||||
text = text_with_tag.replace(position_tag, "").strip()
|
||||
|
||||
# Extract coordinate information from position tag
|
||||
page_number, x0, x1, top, bottom = 1, 0, 0, 0, 0
|
||||
if position_tag:
|
||||
import re
|
||||
|
||||
tag_match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
|
||||
if tag_match:
|
||||
pn, x0_str, x1_str, top_str, bottom_str = tag_match.groups()
|
||||
page_number = int(pn.split("-")[0]) # Take first page number
|
||||
x0, x1, top, bottom = float(x0_str), float(x1_str), float(top_str), float(bottom_str)
|
||||
for t, poss in lines:
|
||||
# Get cropped image and positions
|
||||
cropped_image, positions = pdf_parser.crop(poss, need_position=True)
|
||||
|
||||
box = {
|
||||
"text": text,
|
||||
"page_number": page_number,
|
||||
"x0": x0,
|
||||
"x1": x1,
|
||||
"top": top,
|
||||
"bottom": bottom,
|
||||
"text": t,
|
||||
"image": cropped_image,
|
||||
"positions": positions,
|
||||
}
|
||||
bboxes.append(box)
|
||||
else:
|
||||
|
||||
@ -122,7 +122,7 @@ const PaddleOCRModal = ({
|
||||
disabled={loading}
|
||||
className="btn btn-primary"
|
||||
>
|
||||
{loading ? t('common.adding') : t('common.add')}
|
||||
{t('common.add')}
|
||||
</button>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
Reference in New Issue
Block a user