Files
ragflow/deepdoc/parser/paddleocr_parser.py
Lin Manhui 2e09db02f3 feat: add paddleocr parser (#12513)
### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-01-09 17:48:45 +08:00

401 lines
15 KiB
Python

# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations
import base64
import logging
import os
import re
from dataclasses import asdict, dataclass, field, fields
from io import BytesIO
from os import PathLike
from pathlib import Path
from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List
import requests
# Prefer the real RAGFlow PDF parser base class. If importing it fails for any
# reason, fall back to an empty placeholder with the same name so that this
# module can still be imported and used on its own.
try:
    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
except Exception:

    class RAGFlowPdfParser:
        # Placeholder base class: intentionally provides no behavior.
        pass
# Name of the recognition algorithm; "PaddleOCR-VL" is the only accepted value.
AlgorithmType = Literal["PaddleOCR-VL"]
# One parsed text section, as a tuple of strings. The parser in this file emits
# (content, tag), (content + tag, label) or (content, label, tag) depending on
# the requested parse method.
SectionTuple = tuple[str, ...]
# One parsed table, as a tuple of strings.
TableTuple = tuple[str, ...]
# Result of a parse call: (list of sections, list of tables).
ParseResult = tuple[list[SectionTuple], list[TableTuple]]
_MARKDOWN_IMAGE_PATTERN = re.compile(
r"""
<div[^>]*>\s*
<img[^>]*/>\s*
</div>
|
<img[^>]*/>
""",
re.IGNORECASE | re.VERBOSE | re.DOTALL,
)
def _remove_images_from_markdown(markdown: str) -> str:
return _MARKDOWN_IMAGE_PATTERN.sub("", markdown)
@dataclass
class PaddleOCRVLConfig:
    """Option set for the PaddleOCR-VL algorithm.

    Every option defaults to ``None`` except ``format_block_content``, which
    defaults to ``True``. The field order is part of the contract because it
    determines the positional ``__init__`` signature and ``asdict`` ordering.
    """

    # Boolean ``use_*`` switches.
    use_doc_orientation_classify: bool | None = None
    use_doc_unwarping: bool | None = None
    use_layout_detection: bool | None = None
    use_polygon_points: bool | None = None
    use_chart_recognition: bool | None = None
    use_seal_recognition: bool | None = None
    use_ocr_for_image_block: bool | None = None

    # ``layout_*`` settings; several accept either a scalar or a dict.
    layout_threshold: float | dict | None = None
    layout_nms: bool | None = None
    layout_unclip_ratio: float | tuple[float, float] | dict | None = None
    layout_merge_bboxes_mode: str | dict | None = None

    # Remaining options.
    prompt_label: str | None = None
    format_block_content: bool | None = True
    repetition_penalty: float | None = None
    temperature: float | None = None
    top_p: float | None = None
    min_pixels: int | None = None
    max_pixels: int | None = None
    max_new_tokens: int | None = None
    merge_layout_blocks: bool | None = None
    markdown_ignore_labels: list[str] | None = None
    vlm_extra_args: dict | None = None
@dataclass
class PaddleOCRConfig:
    """Top-level settings for one PaddleOCR parsing request.

    ``algorithm_config`` holds the algorithm-specific options as a plain dict;
    it is filled in by :meth:`from_dict`.
    """

    api_url: str = ""
    access_token: str | None = None
    algorithm: AlgorithmType = "PaddleOCR-VL"
    request_timeout: int = 600
    prettify_markdown: bool = True
    show_formula_number: bool = True
    visualize: bool = False
    additional_params: dict[str, Any] = field(default_factory=dict)
    algorithm_config: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, config: dict[str, Any] | None) -> "PaddleOCRConfig":
        """Build a configuration from a plain dictionary.

        An empty or missing *config* yields the all-defaults instance. The
        optional ``"vl"`` entry carries PaddleOCR-VL overrides; its non-``None``
        items are layered on top of the ``PaddleOCRVLConfig`` defaults. Keys
        that are not fields of this class are ignored.

        Raises:
            ValueError: if ``config["algorithm"]`` is not a supported algorithm.
        """
        if not config:
            return cls()

        options = config.copy()
        selected = options.get("algorithm", "PaddleOCR-VL")
        if selected not in ("PaddleOCR-VL",):
            raise ValueError(f"Unsupported algorithm: {selected}")

        algo_options: dict[str, Any] = {}
        if selected == "PaddleOCR-VL":
            # Start from the dataclass defaults, then overlay user overrides.
            algo_options = asdict(PaddleOCRVLConfig())
            overrides = options.pop("vl", None)
            if isinstance(overrides, dict):
                algo_options.update({name: value for name, value in overrides.items() if value is not None})

        # Forward only the keys that correspond to declared dataclass fields.
        init_kwargs: dict[str, Any] = {spec.name: options[spec.name] for spec in fields(cls) if spec.name in options}
        # The computed algorithm options always replace any caller-supplied value.
        init_kwargs["algorithm_config"] = algo_options
        return cls(**init_kwargs)

    @classmethod
    def from_kwargs(cls, **kwargs: Any) -> "PaddleOCRConfig":
        """Build a configuration from keyword arguments (see :meth:`from_dict`)."""
        return cls.from_dict(kwargs)
class PaddleOCRParser(RAGFlowPdfParser):
    """Parser for PDF documents using PaddleOCR API.

    The document is base64-encoded, posted as JSON to the configured API URL,
    and the layout blocks found in the response are converted into section
    tuples.
    """

    # Maps PaddleOCRConfig attribute names to the request payload keys.
    _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
        "prettify_markdown": "prettifyMarkdown",
        "show_formula_number": "showFormulaNumber",
        "visualize": "visualize",
    }
    # Per algorithm: maps algorithm option names to the request payload keys.
    _ALGORITHM_FIELD_MAPPINGS: ClassVar[dict[str, dict[str, str]]] = {
        "PaddleOCR-VL": {
            "use_doc_orientation_classify": "useDocOrientationClassify",
            "use_doc_unwarping": "useDocUnwarping",
            "use_layout_detection": "useLayoutDetection",
            "use_polygon_points": "usePolygonPoints",
            "use_chart_recognition": "useChartRecognition",
            "use_seal_recognition": "useSealRecognition",
            "use_ocr_for_image_block": "useOcrForImageBlock",
            "layout_threshold": "layoutThreshold",
            "layout_nms": "layoutNms",
            "layout_unclip_ratio": "layoutUnclipRatio",
            "layout_merge_bboxes_mode": "layoutMergeBboxesMode",
            "prompt_label": "promptLabel",
            "format_block_content": "formatBlockContent",
            "repetition_penalty": "repetitionPenalty",
            "temperature": "temperature",
            "top_p": "topP",
            "min_pixels": "minPixels",
            "max_pixels": "maxPixels",
            "max_new_tokens": "maxNewTokens",
            "merge_layout_blocks": "mergeLayoutBlocks",
            "markdown_ignore_labels": "markdownIgnoreLabels",
            "vlm_extra_args": "vlmExtraArgs",
        },
    }

    def __init__(
        self,
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: AlgorithmType = "PaddleOCR-VL",
        *,
        request_timeout: int = 600,
    ):
        """Initialize PaddleOCR parser.

        Args:
            api_url: Endpoint of the PaddleOCR service. Trailing slashes are
                removed. Falls back to the ``PADDLEOCR_API_URL`` environment
                variable when empty.
            access_token: Token sent in the ``Authorization`` header. Falls
                back to the ``PADDLEOCR_ACCESS_TOKEN`` environment variable.
            algorithm: Default algorithm name used by :meth:`parse_pdf`.
            request_timeout: Default HTTP timeout passed to ``requests``.
        """
        # NOTE(review): the base-class initializer is not invoked here —
        # presumably intentional; confirm against RAGFlowPdfParser.
        self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
        self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
        self.algorithm = algorithm
        self.request_timeout = request_timeout
        self.logger = logging.getLogger(self.__class__.__name__)
        # Force PDF file type
        self.file_type = 0

    # Public methods
    def check_installation(self) -> tuple[bool, str]:
        """Check if the parser is properly installed and configured.

        Returns:
            ``(True, "")`` when an API URL is set, otherwise ``(False, reason)``.
        """
        if not self.api_url:
            return False, "[PaddleOCR] API URL not configured"
        # TODO [@Bobholamovic]: Check URL availability and token validity
        return True, ""

    def parse_pdf(
        self,
        filepath: str | PathLike[str],
        binary: BytesIO | bytes | None = None,
        callback: Optional[Callable[[float, str], None]] = None,
        *,
        parse_method: str = "raw",
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: Optional[AlgorithmType] = None,
        request_timeout: Optional[int] = None,
        prettify_markdown: Optional[bool] = None,
        show_formula_number: Optional[bool] = None,
        visualize: Optional[bool] = None,
        additional_params: Optional[dict[str, Any]] = None,
        vl_config: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ) -> ParseResult:
        """Parse PDF document using PaddleOCR API.

        Args:
            filepath: Path of the document; only read when *binary* is ``None``.
            binary: In-memory document content; takes precedence over *filepath*.
            callback: Optional progress reporter called as ``callback(progress, message)``;
                ``-1`` is reported on failure.
            parse_method: Selects the section tuple layout: ``"manual"`` gives
                ``(content, label, tag)``, ``"paper"`` gives ``(content + tag, label)``,
                anything else gives ``(content, tag)``.
            api_url, access_token, algorithm, request_timeout: Per-call overrides
                of the values given to the constructor.
            prettify_markdown, show_formula_number, visualize: Optional request flags;
                configuration defaults apply when ``None``.
            additional_params: Extra entries merged verbatim into the request payload.
            vl_config: PaddleOCR-VL options as a dict.
            **kwargs: Keys that match a ``PaddleOCRVLConfig`` field are treated as
                PaddleOCR-VL options (entries in *vl_config* win on conflict);
                all other keys are ignored.

        Returns:
            A ``(sections, tables)`` pair.

        Raises:
            RuntimeError: if no API URL is available, the request fails, or the
                response is malformed.
            ValueError: if the algorithm is not supported.
            FileNotFoundError: if *binary* is ``None`` and *filepath* does not exist.
        """
        config_dict: dict[str, Any] = {
            "api_url": api_url if api_url is not None else self.api_url,
            "access_token": access_token if access_token is not None else self.access_token,
            "algorithm": algorithm if algorithm is not None else self.algorithm,
            "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
        }
        if prettify_markdown is not None:
            config_dict["prettify_markdown"] = prettify_markdown
        if show_formula_number is not None:
            config_dict["show_formula_number"] = show_formula_number
        if visualize is not None:
            config_dict["visualize"] = visualize
        if additional_params is not None:
            config_dict["additional_params"] = additional_params

        # Gather PaddleOCR-VL options under the "vl" key, which is the only
        # place PaddleOCRConfig.from_dict looks for them. Options passed as
        # loose keyword arguments form the base; the explicit vl_config dict
        # is layered on top. A fresh dict is built so the caller's vl_config
        # is never mutated.
        vl_field_names = {spec.name for spec in fields(PaddleOCRVLConfig)}
        vl_options: dict[str, Any] = {key: value for key, value in kwargs.items() if key in vl_field_names and value is not None}
        if isinstance(vl_config, dict):
            vl_options.update({key: value for key, value in vl_config.items() if value is not None})
        if vl_options:
            config_dict["vl"] = vl_options

        cfg = PaddleOCRConfig.from_dict(config_dict)
        if not cfg.api_url:
            raise RuntimeError("[PaddleOCR] API URL missing")

        # Prepare file data
        data_bytes = self._prepare_file_data(filepath, binary)

        # Build and send request
        result = self._send_request(data_bytes, cfg, callback)

        # Process response
        sections = self._transfer_to_sections(result, algorithm=cfg.algorithm, parse_method=parse_method)
        if callback:
            callback(0.9, f"[PaddleOCR] done, sections: {len(sections)}")
        tables = self._transfer_to_tables(result)
        if callback:
            callback(1.0, f"[PaddleOCR] done, tables: {len(tables)}")
        return sections, tables

    def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes:
        """Prepare file data for API request.

        In-memory *binary* content is used when given; otherwise the file at
        *filepath* is read.

        Raises:
            FileNotFoundError: if *binary* is ``None`` and *filepath* does not exist.
        """
        if binary is not None:
            if isinstance(binary, (bytes, bytearray)):
                # Normalize bytearray to bytes so the declared return type holds.
                return bytes(binary)
            return binary.getbuffer().tobytes()

        # Only touch the path when it is actually needed, so an unusable
        # filepath does not break calls that supply the content in memory.
        source_path = Path(filepath)
        if not source_path.exists():
            raise FileNotFoundError(f"[PaddleOCR] file not found: {source_path}")
        return source_path.read_bytes()

    def _build_payload(self, data: bytes, file_type: int, config: PaddleOCRConfig) -> dict[str, Any]:
        """Build payload for API request.

        The payload contains the base64-encoded file, the file type, the
        common flags, every non-``None`` algorithm option that has a known
        payload key, and finally ``config.additional_params`` (which may
        override any earlier entry).
        """
        payload: dict[str, Any] = {
            "file": base64.b64encode(data).decode("ascii"),
            "fileType": file_type,
        }

        # Add common parameters
        for attr_name, api_param in self._COMMON_FIELD_MAPPING.items():
            attr_value = getattr(config, attr_name)
            if attr_value is not None:
                payload[api_param] = attr_value

        # Add algorithm-specific parameters
        algorithm_mapping = self._ALGORITHM_FIELD_MAPPINGS.get(config.algorithm, {})
        for param_key, param_value in config.algorithm_config.items():
            if param_value is not None and param_key in algorithm_mapping:
                payload[algorithm_mapping[param_key]] = param_value

        # Add any additional parameters
        if config.additional_params:
            payload.update(config.additional_params)
        return payload

    def _send_request(self, data: bytes, config: PaddleOCRConfig, callback: Optional[Callable[[float, str], None]]) -> dict[str, Any]:
        """Send request to PaddleOCR API and parse response.

        Returns:
            The ``"result"`` object of the JSON response.

        Raises:
            RuntimeError: if the HTTP request fails, the body is not JSON, or
                the response does not have ``errorCode == 0`` and a dict ``result``.
        """
        # Build payload
        payload = self._build_payload(data, self.file_type, config)

        # Prepare headers
        headers = {"Content-Type": "application/json", "Client-Platform": "ragflow"}
        if config.access_token:
            headers["Authorization"] = f"token {config.access_token}"

        self.logger.info("[PaddleOCR] invoking API")
        if callback:
            callback(0.1, "[PaddleOCR] submitting request")

        # Send request. The timeout comes from the per-call configuration so
        # that a request_timeout override given to parse_pdf is honored.
        try:
            resp = requests.post(config.api_url, json=payload, headers=headers, timeout=config.request_timeout)
            resp.raise_for_status()
        except Exception as exc:
            if callback:
                callback(-1, f"[PaddleOCR] request failed: {exc}")
            raise RuntimeError(f"[PaddleOCR] request failed: {exc}") from exc

        # Parse response
        try:
            response_data = resp.json()
        except Exception as exc:
            raise RuntimeError(f"[PaddleOCR] response is not JSON: {exc}") from exc
        if callback:
            callback(0.8, "[PaddleOCR] response received")

        # Validate response format (a non-dict JSON body is also rejected here
        # instead of failing with AttributeError on .get()).
        if not isinstance(response_data, dict) or response_data.get("errorCode") != 0 or not isinstance(response_data.get("result"), dict):
            if callback:
                callback(-1, "[PaddleOCR] invalid response format")
            raise RuntimeError("[PaddleOCR] invalid response format")
        return response_data["result"]

    def _transfer_to_sections(self, result: dict[str, Any], algorithm: AlgorithmType, parse_method: str) -> list[SectionTuple]:
        """Convert API response to section tuples.

        Each non-empty block of every page becomes one tuple whose layout
        depends on *parse_method* (see :meth:`parse_pdf`). Image tags are
        stripped from the content, and blocks left empty afterwards are skipped.
        """
        sections: list[SectionTuple] = []
        if algorithm != "PaddleOCR-VL":
            return sections

        layout_parsing_results = result.get("layoutParsingResults") or []
        for page_idx, layout_result in enumerate(layout_parsing_results):
            pruned_result = layout_result.get("prunedResult") or {}
            for block in pruned_result.get("parsing_res_list") or []:
                # Remove images first so that image-only blocks are detected
                # as empty and skipped rather than emitted as blank sections.
                block_content = _remove_images_from_markdown(block.get("block_content") or "").strip()
                if not block_content:
                    continue

                label = block.get("block_label", "")
                block_bbox = block.get("block_bbox") or [0, 0, 0, 0]
                if len(block_bbox) < 4:
                    # Malformed box: fall back to the same default as a missing one.
                    block_bbox = [0, 0, 0, 0]
                # Position tag: 1-based page number followed by bbox items in
                # the order [0], [2], [1], [3] — presumably left, right, top,
                # bottom; confirm against the consumer of these tags.
                tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##"

                if parse_method == "manual":
                    sections.append((block_content, label, tag))
                elif parse_method == "paper":
                    sections.append((block_content + tag, label))
                else:
                    sections.append((block_content, tag))
        return sections

    def _transfer_to_tables(self, result: dict[str, Any]) -> list[TableTuple]:
        """Convert API response to table tuples.

        Currently always returns an empty list; *result* is not inspected.
        """
        return []
# Manual smoke check: report whether the parser is configured, using the
# PADDLEOCR_API_URL / PADDLEOCR_ALGORITHM environment variables.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    env_api_url = os.getenv("PADDLEOCR_API_URL", "")
    env_algorithm = os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL")
    smoke_parser = PaddleOCRParser(api_url=env_api_url, algorithm=env_algorithm)
    available, detail = smoke_parser.check_installation()
    print("PaddleOCR available:", available, detail)