mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-19 11:45:10 +08:00
### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
401 lines
15 KiB
Python
401 lines
15 KiB
Python
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import logging
|
|
import os
|
|
import re
|
|
from dataclasses import asdict, dataclass, field, fields
|
|
from io import BytesIO
|
|
from os import PathLike
|
|
from pathlib import Path
|
|
from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List
|
|
|
|
import requests
|
|
|
|
# Prefer the real base parser; if importing it fails for any reason, fall
# back to an empty stand-in class so that this module can still be imported
# and PaddleOCRParser can still be defined.
try:
    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
except Exception:

    class RAGFlowPdfParser:
        # Empty placeholder base class used only when the import above fails.
        pass
|
|
|
|
|
|
# Closed set of algorithm names accepted by this module; "PaddleOCR-VL" is
# the only one defined.
AlgorithmType = Literal["PaddleOCR-VL"]
# One parsed section: a variable-length tuple of strings (the exact layout is
# chosen by ``parse_method`` in PaddleOCRParser._transfer_to_sections).
SectionTuple = tuple[str, ...]
# One parsed table, also a variable-length tuple of strings.
TableTuple = tuple[str, ...]
# Pair of (sections, tables) as returned by PaddleOCRParser.parse_pdf.
ParseResult = tuple[list[SectionTuple], list[TableTuple]]
|
|
|
|
|
|
# Matches HTML image markup embedded in markdown text. Two alternatives:
#   1. a <div ...> element whose only content is one self-closing <img .../>
#      tag (optionally surrounded by whitespace), or
#   2. a bare self-closing <img .../> tag.
# The pattern is compiled with re.VERBOSE, so the whitespace and newlines
# inside the literal are ignored, and with re.IGNORECASE, so tag names match
# regardless of case.
_MARKDOWN_IMAGE_PATTERN = re.compile(
    r"""
    <div[^>]*>\s*
    <img[^>]*/>\s*
    </div>
    |
    <img[^>]*/>
    """,
    re.IGNORECASE | re.VERBOSE | re.DOTALL,
)
|
|
|
|
|
|
def _remove_images_from_markdown(markdown: str) -> str:
    """Return *markdown* with every ``_MARKDOWN_IMAGE_PATTERN`` match deleted."""
    without_images = re.sub(_MARKDOWN_IMAGE_PATTERN, "", markdown)
    return without_images
|
|
|
|
|
|
@dataclass
class PaddleOCRVLConfig:
    """Configuration for PaddleOCR-VL algorithm.

    Every option defaults to ``None`` except ``format_block_content``, which
    defaults to ``True``.
    """

    # NOTE: the field names below match, one for one, the snake_case keys of
    # PaddleOCRParser._ALGORITHM_FIELD_MAPPINGS["PaddleOCR-VL"], which maps
    # each of them to the camelCase key used in the request payload.
    # PaddleOCRParser._build_payload leaves out options whose value is None.
    use_doc_orientation_classify: Optional[bool] = None
    use_doc_unwarping: Optional[bool] = None
    use_layout_detection: Optional[bool] = None
    use_polygon_points: Optional[bool] = None
    use_chart_recognition: Optional[bool] = None
    use_seal_recognition: Optional[bool] = None
    use_ocr_for_image_block: Optional[bool] = None
    layout_threshold: Optional[Union[float, dict]] = None
    layout_nms: Optional[bool] = None
    layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None
    layout_merge_bboxes_mode: Optional[Union[str, dict]] = None
    prompt_label: Optional[str] = None
    # The only option with a non-None default.
    format_block_content: Optional[bool] = True
    repetition_penalty: Optional[float] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    min_pixels: Optional[int] = None
    max_pixels: Optional[int] = None
    max_new_tokens: Optional[int] = None
    merge_layout_blocks: Optional[bool] = None
    markdown_ignore_labels: Optional[List[str]] = None
    vlm_extra_args: Optional[dict] = None
|
|
|
|
|
|
@dataclass
class PaddleOCRConfig:
    """Top-level settings for a single PaddleOCR API call.

    ``algorithm_config`` holds the algorithm-specific options as a plain
    dictionary; ``additional_params`` holds extra entries that the parser
    merges into the request payload.
    """

    api_url: str = ""
    access_token: Optional[str] = None
    algorithm: AlgorithmType = "PaddleOCR-VL"
    request_timeout: int = 600
    prettify_markdown: bool = True
    show_formula_number: bool = True
    visualize: bool = False
    additional_params: dict[str, Any] = field(default_factory=dict)
    algorithm_config: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, config: Optional[dict[str, Any]]) -> "PaddleOCRConfig":
        """Build a configuration from a plain dictionary.

        Args:
            config: Mapping of option names to values. Keys that are not
                fields of this class are ignored, except ``"vl"``, which may
                carry a dict of PaddleOCR-VL overrides. ``None`` or an empty
                mapping yields the default configuration.

        Returns:
            A new ``PaddleOCRConfig``.

        Raises:
            ValueError: If ``config["algorithm"]`` is not a supported name.
        """
        if not config:
            return cls()

        options = config.copy()
        supported_algorithms = ("PaddleOCR-VL",)
        chosen = options.get("algorithm", "PaddleOCR-VL")
        if chosen not in supported_algorithms:
            raise ValueError(f"Unsupported algorithm: {chosen}")

        algorithm_options: dict[str, Any] = {}
        if chosen == "PaddleOCR-VL":
            # Start from the dataclass defaults, then overlay the caller's
            # non-None overrides found under the "vl" key.
            algorithm_options = asdict(PaddleOCRVLConfig())
            overrides = options.pop("vl", None)
            if isinstance(overrides, dict):
                for name, value in overrides.items():
                    if value is not None:
                        algorithm_options[name] = value

        # Forward only the keys that correspond to declared dataclass fields.
        init_kwargs: dict[str, Any] = {
            spec.name: options[spec.name]
            for spec in fields(cls)
            if spec.name in options
        }
        # Always install the freshly built algorithm options.
        init_kwargs["algorithm_config"] = algorithm_options
        return cls(**init_kwargs)

    @classmethod
    def from_kwargs(cls, **kwargs: Any) -> "PaddleOCRConfig":
        """Build a configuration from keyword arguments (see ``from_dict``)."""
        return cls.from_dict(kwargs)
|
|
|
|
|
|
class PaddleOCRParser(RAGFlowPdfParser):
    """Parser for PDF documents using PaddleOCR API.

    The document is base64-encoded and POSTed as JSON to the configured API
    URL. The ``layoutParsingResults`` entry of the response is converted into
    section tuples that carry a position tag built from the page number and
    the block bounding box.
    """

    # snake_case option name -> key used in the request payload.
    _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
        "prettify_markdown": "prettifyMarkdown",
        "show_formula_number": "showFormulaNumber",
        "visualize": "visualize",
    }

    # Per-algorithm translation of option names to request payload keys.
    _ALGORITHM_FIELD_MAPPINGS: ClassVar[dict[str, dict[str, str]]] = {
        "PaddleOCR-VL": {
            "use_doc_orientation_classify": "useDocOrientationClassify",
            "use_doc_unwarping": "useDocUnwarping",
            "use_layout_detection": "useLayoutDetection",
            "use_polygon_points": "usePolygonPoints",
            "use_chart_recognition": "useChartRecognition",
            "use_seal_recognition": "useSealRecognition",
            "use_ocr_for_image_block": "useOcrForImageBlock",
            "layout_threshold": "layoutThreshold",
            "layout_nms": "layoutNms",
            "layout_unclip_ratio": "layoutUnclipRatio",
            "layout_merge_bboxes_mode": "layoutMergeBboxesMode",
            "prompt_label": "promptLabel",
            "format_block_content": "formatBlockContent",
            "repetition_penalty": "repetitionPenalty",
            "temperature": "temperature",
            "top_p": "topP",
            "min_pixels": "minPixels",
            "max_pixels": "maxPixels",
            "max_new_tokens": "maxNewTokens",
            "merge_layout_blocks": "mergeLayoutBlocks",
            "markdown_ignore_labels": "markdownIgnoreLabels",
            "vlm_extra_args": "vlmExtraArgs",
        },
    }

    def __init__(
        self,
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: AlgorithmType = "PaddleOCR-VL",
        *,
        request_timeout: int = 600,
    ):
        """Initialize PaddleOCR parser.

        Args:
            api_url: Endpoint URL. Falls back to the ``PADDLEOCR_API_URL``
                environment variable when empty or ``None``.
            access_token: API token. Falls back to the
                ``PADDLEOCR_ACCESS_TOKEN`` environment variable.
            algorithm: Default algorithm name for requests.
            request_timeout: Default timeout passed to the HTTP request.
        """
        # NOTE(review): the base-class initializer is not invoked, exactly as
        # in the previous implementation; confirm that this is intentional.
        # Trailing slashes are stripped whichever source the URL comes from,
        # so the explicit argument and the environment variable behave alike.
        resolved_url = api_url or os.getenv("PADDLEOCR_API_URL", "")
        self.api_url = resolved_url.rstrip("/")
        self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
        self.algorithm = algorithm
        self.request_timeout = request_timeout
        self.logger = logging.getLogger(self.__class__.__name__)

        # Force PDF file type
        self.file_type = 0

    # Public methods
    def check_installation(self) -> tuple[bool, str]:
        """Check if the parser is properly installed and configured.

        Returns:
            ``(True, "")`` when an API URL is configured, otherwise
            ``(False, reason)``.
        """
        if not self.api_url:
            return False, "[PaddleOCR] API URL not configured"

        # TODO [@Bobholamovic]: Check URL availability and token validity

        return True, ""

    def parse_pdf(
        self,
        filepath: str | PathLike[str],
        binary: BytesIO | bytes | None = None,
        callback: Optional[Callable[[float, str], None]] = None,
        *,
        parse_method: str = "raw",
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: Optional[AlgorithmType] = None,
        request_timeout: Optional[int] = None,
        prettify_markdown: Optional[bool] = None,
        show_formula_number: Optional[bool] = None,
        visualize: Optional[bool] = None,
        additional_params: Optional[dict[str, Any]] = None,
        vl_config: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ) -> ParseResult:
        """Parse PDF document using PaddleOCR API.

        Args:
            filepath: Path of the document; only read when ``binary`` is None.
            binary: In-memory document content, used instead of ``filepath``.
            callback: Optional ``callback(progress, message)`` progress hook;
                it receives ``-1`` as progress when the request fails.
            parse_method: ``"manual"`` yields ``(content, label, tag)``
                sections, ``"paper"`` yields ``(content + tag, label)``, and
                any other value yields ``(content, tag)``.
            api_url, access_token, algorithm, request_timeout: Per-call
                overrides of the values given to the constructor.
            prettify_markdown, show_formula_number, visualize: Optional
                overrides of the corresponding ``PaddleOCRConfig`` defaults.
            additional_params: Extra entries merged into the request payload.
            vl_config: PaddleOCR-VL options as a dictionary.
            **kwargs: Loose keyword arguments. Those named after a
                ``PaddleOCRVLConfig`` field are treated as PaddleOCR-VL
                options; everything else is ignored.

        Returns:
            A ``(sections, tables)`` pair.

        Raises:
            ValueError: If the algorithm name is not supported.
            RuntimeError: If no API URL is available, the request fails, or
                the response is malformed.
            FileNotFoundError: If ``binary`` is None and ``filepath`` does
                not exist.
        """
        config_dict: dict[str, Any] = {
            "api_url": api_url if api_url is not None else self.api_url,
            "access_token": access_token if access_token is not None else self.access_token,
            "algorithm": algorithm if algorithm is not None else self.algorithm,
            "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
        }
        optional_overrides = {
            "prettify_markdown": prettify_markdown,
            "show_formula_number": show_formula_number,
            "visualize": visualize,
            "additional_params": additional_params,
        }
        config_dict.update({name: value for name, value in optional_overrides.items() if value is not None})

        # Collect PaddleOCR-VL options under the "vl" key, which is the only
        # place PaddleOCRConfig.from_dict reads them from. Previously loose
        # keyword options were stored at the top level and silently dropped.
        # The field-name set is built once rather than once per keyword.
        vl_field_names = {spec.name for spec in fields(PaddleOCRVLConfig)}
        vl_options: dict[str, Any] = {name: value for name, value in kwargs.items() if name in vl_field_names}
        if isinstance(vl_config, dict):
            # An explicit vl_config dictionary wins over loose keywords; it
            # is merged into a fresh dict so the caller's dict is not mutated.
            vl_options.update(vl_config)
        if vl_options:
            config_dict["vl"] = vl_options

        cfg = PaddleOCRConfig.from_dict(config_dict)

        if not cfg.api_url:
            raise RuntimeError("[PaddleOCR] API URL missing")

        # Prepare file data
        data_bytes = self._prepare_file_data(filepath, binary)

        # Build and send request
        result = self._send_request(data_bytes, cfg, callback)

        # Process response
        sections = self._transfer_to_sections(result, algorithm=cfg.algorithm, parse_method=parse_method)
        if callback:
            callback(0.9, f"[PaddleOCR] done, sections: {len(sections)}")

        tables = self._transfer_to_tables(result)
        if callback:
            callback(1.0, f"[PaddleOCR] done, tables: {len(tables)}")

        return sections, tables

    def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes:
        """Prepare file data for API request.

        Returns the content of ``binary`` when it is given, otherwise the
        bytes read from ``filepath``.

        Raises:
            FileNotFoundError: If ``binary`` is None and ``filepath`` does
                not exist.
        """
        if binary is not None:
            # In-memory data wins; filepath is not touched at all in this
            # case, so an unusable path cannot break an in-memory parse.
            if isinstance(binary, (bytes, bytearray, memoryview)):
                return bytes(binary)
            if hasattr(binary, "getvalue"):
                return binary.getvalue()
            return binary.getbuffer().tobytes()

        source_path = Path(filepath)
        if not source_path.exists():
            raise FileNotFoundError(f"[PaddleOCR] file not found: {source_path}")

        return source_path.read_bytes()

    def _build_payload(self, data: bytes, file_type: int, config: PaddleOCRConfig) -> dict[str, Any]:
        """Build payload for API request.

        The payload contains the base64-encoded file, the file type, the
        common options, the algorithm-specific options that are not None and
        known to the field mapping, and finally ``config.additional_params``.
        """
        payload: dict[str, Any] = {
            "file": base64.b64encode(data).decode("ascii"),
            "fileType": file_type,
        }

        # Add common parameters
        for option_name, api_param in self._COMMON_FIELD_MAPPING.items():
            option_value = getattr(config, option_name)
            if option_value is not None:
                payload[api_param] = option_value

        # Add algorithm-specific parameters
        algorithm_mapping = self._ALGORITHM_FIELD_MAPPINGS.get(config.algorithm, {})
        for option_name, option_value in config.algorithm_config.items():
            if option_value is not None and option_name in algorithm_mapping:
                payload[algorithm_mapping[option_name]] = option_value

        # Add any additional parameters (applied last, so they override
        # anything set above)
        if config.additional_params:
            payload.update(config.additional_params)

        return payload

    def _send_request(self, data: bytes, config: PaddleOCRConfig, callback: Optional[Callable[[float, str], None]]) -> dict[str, Any]:
        """Send request to PaddleOCR API and parse response.

        Returns:
            The ``result`` dictionary of the JSON response.

        Raises:
            RuntimeError: If the HTTP request fails, the body is not JSON, or
                the response does not have ``errorCode == 0`` together with a
                dictionary ``result``.
        """
        payload = self._build_payload(data, self.file_type, config)

        headers = {"Content-Type": "application/json", "Client-Platform": "ragflow"}
        if config.access_token:
            headers["Authorization"] = f"token {config.access_token}"

        self.logger.info("[PaddleOCR] invoking API")
        if callback:
            callback(0.1, "[PaddleOCR] submitting request")

        try:
            # Use the timeout of this call's configuration so that a per-call
            # request_timeout override is honoured, not only the constructor
            # default.
            resp = requests.post(config.api_url, json=payload, headers=headers, timeout=config.request_timeout)
            resp.raise_for_status()
        except Exception as exc:
            if callback:
                callback(-1, f"[PaddleOCR] request failed: {exc}")
            raise RuntimeError(f"[PaddleOCR] request failed: {exc}") from exc

        try:
            response_data = resp.json()
        except Exception as exc:
            # Report the failure to the callback like every other error path.
            if callback:
                callback(-1, f"[PaddleOCR] response is not JSON: {exc}")
            raise RuntimeError(f"[PaddleOCR] response is not JSON: {exc}") from exc

        if callback:
            callback(0.8, "[PaddleOCR] response received")

        # Validate response format; a non-dict JSON body is rejected here
        # instead of failing with AttributeError on .get().
        well_formed = isinstance(response_data, dict) and response_data.get("errorCode") == 0 and isinstance(response_data.get("result"), dict)
        if not well_formed:
            if isinstance(response_data, dict):
                self.logger.error("[PaddleOCR] invalid response, errorCode=%r", response_data.get("errorCode"))
            if callback:
                callback(-1, "[PaddleOCR] invalid response format")
            raise RuntimeError("[PaddleOCR] invalid response format")

        return response_data["result"]

    def _transfer_to_sections(self, result: dict[str, Any], algorithm: AlgorithmType, parse_method: str) -> list[SectionTuple]:
        """Convert API response to section tuples.

        Blocks whose content is empty once image markup has been removed are
        skipped. Missing or null containers in the response are treated as
        empty, and a missing or short bounding box falls back to zeros.
        """
        sections: list[SectionTuple] = []
        if algorithm != "PaddleOCR-VL":
            return sections

        for page_idx, layout_result in enumerate(result.get("layoutParsingResults") or []):
            pruned_result = layout_result.get("prunedResult") or {}

            for block in pruned_result.get("parsing_res_list") or []:
                # Strip image markup first, then test for emptiness, so that
                # image-only blocks do not become empty sections.
                block_content = _remove_images_from_markdown(block.get("block_content") or "").strip()
                if not block_content:
                    continue

                label = block.get("block_label", "")
                block_bbox = block.get("block_bbox") or ()
                if len(block_bbox) < 4:
                    block_bbox = (0, 0, 0, 0)

                # Tag order is page, bbox[0], bbox[2], bbox[1], bbox[3];
                # presumably the bbox is [x0, y0, x1, y1], confirm against
                # the API schema.
                tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##"

                if parse_method == "manual":
                    sections.append((block_content, label, tag))
                elif parse_method == "paper":
                    sections.append((block_content + tag, label))
                else:
                    sections.append((block_content, tag))

        return sections

    def _transfer_to_tables(self, result: dict[str, Any]) -> list[TableTuple]:
        """Convert API response to table tuples.

        Currently no tables are extracted: the result is always an empty list.
        """
        return []
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke check: build a parser from environment variables and
    # report whether it is configured.
    logging.basicConfig(level=logging.INFO)
    env_api_url = os.getenv("PADDLEOCR_API_URL", "")
    env_algorithm = os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL")
    parser = PaddleOCRParser(api_url=env_api_url, algorithm=env_algorithm)
    ok, reason = parser.check_installation()
    print("PaddleOCR available:", ok, reason)
|